Commit 1b2e7884 authored by David S. Miller

Merge branch 'tcp-sack-compression-changes'

Eric Dumazet says:

====================
tcp: sack compression changes

Patch series refines SACK compression.

We had issues with missing SACK when TCP option space is tight.

Uses hrtimer slack to improve performance.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3857c776 a70437cc
...@@ -651,6 +651,14 @@ tcp_comp_sack_delay_ns - LONG INTEGER ...@@ -651,6 +651,14 @@ tcp_comp_sack_delay_ns - LONG INTEGER
Default : 1,000,000 ns (1 ms) Default : 1,000,000 ns (1 ms)
tcp_comp_sack_slack_ns - LONG INTEGER
This sysctl controls the slack used when arming the
timer used by SACK compression. This gives extra time
for small RTT flows, and reduces system overhead by allowing
opportunistic reduction of timer interrupts.
Default : 100,000 ns (100 us)
tcp_comp_sack_nr - INTEGER tcp_comp_sack_nr - INTEGER
Max number of SACK that can be compressed. Max number of SACK that can be compressed.
Using 0 disables SACK compression. Using 0 disables SACK compression.
......
...@@ -268,6 +268,7 @@ struct tcp_sock { ...@@ -268,6 +268,7 @@ struct tcp_sock {
} rack; } rack;
u16 advmss; /* Advertised MSS */ u16 advmss; /* Advertised MSS */
u8 compressed_ack; u8 compressed_ack;
u8 dup_ack_counter;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */ u8 chrono_type:2, /* current chronograph type */
......
...@@ -173,6 +173,7 @@ struct netns_ipv4 { ...@@ -173,6 +173,7 @@ struct netns_ipv4 {
int sysctl_tcp_rmem[3]; int sysctl_tcp_rmem[3];
int sysctl_tcp_comp_sack_nr; int sysctl_tcp_comp_sack_nr;
unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_delay_ns;
unsigned long sysctl_tcp_comp_sack_slack_ns;
struct inet_timewait_death_row tcp_death_row; struct inet_timewait_death_row tcp_death_row;
int sysctl_max_syn_backlog; int sysctl_max_syn_backlog;
int sysctl_tcp_fastopen; int sysctl_tcp_fastopen;
......
...@@ -1329,6 +1329,13 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -1329,6 +1329,13 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_doulongvec_minmax, .proc_handler = proc_doulongvec_minmax,
}, },
{
.procname = "tcp_comp_sack_slack_ns",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{ {
.procname = "tcp_comp_sack_nr", .procname = "tcp_comp_sack_nr",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
......
...@@ -4327,6 +4327,33 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) ...@@ -4327,6 +4327,33 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
} }
} }
/* Flush pending compressed ACKs by sending a single ACK immediately.
 *
 * Does nothing when no ACK is currently being held back
 * (tp->compressed_ack == 0). Otherwise the compression timer is
 * cancelled, LINUX_MIB_TCPACKCOMPRESSED is credited, the counter is
 * reset and one ACK is sent through tcp_send_ack().
 */
static void tcp_sack_compress_send_ack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int nr_suppressed;

	if (tp->compressed_ack == 0)
		return;

	/* If the timer was successfully cancelled, drop the socket
	 * reference that goes with the armed timer.
	 */
	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
		__sock_put(sk);

	/* One ACK is going out right now, so it was not really
	 * compressed away: subtract one from tp->compressed_ack to keep
	 * LINUX_MIB_TCPACKCOMPRESSED accurate.
	 */
	nr_suppressed = tp->compressed_ack - 1;
	tp->compressed_ack = 0;
	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
		      nr_suppressed);

	tcp_send_ack(sk);
}
/* Reasonable number of SACK blocks expected to fit in one TCP SACK option.
 * The max is 4, but this becomes 3 if TCP timestamps are there.
 * Given that SACK packets might be lost, be conservative and use 2.
 *
 * tcp_sack_new_ofo_skb() compares the index of the SACK block it is
 * about to touch against this value and, when the index is at or beyond
 * it, calls tcp_sack_compress_send_ack().
 */
#define TCP_SACK_BLOCKS_EXPECTED 2
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
...@@ -4339,6 +4366,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) ...@@ -4339,6 +4366,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
if (tcp_sack_extend(sp, seq, end_seq)) { if (tcp_sack_extend(sp, seq, end_seq)) {
if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
tcp_sack_compress_send_ack(sk);
/* Rotate this_sack to the first one. */ /* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--) for (; this_sack > 0; this_sack--, sp--)
swap(*sp, *(sp - 1)); swap(*sp, *(sp - 1));
...@@ -4348,6 +4377,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) ...@@ -4348,6 +4377,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
} }
} }
if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
tcp_sack_compress_send_ack(sk);
/* Could not find an adjacent existing SACK, build a new one, /* Could not find an adjacent existing SACK, build a new one,
* put it at the front, and shift everyone else down. We * put it at the front, and shift everyone else down. We
* always know there is at least one SACK present already here. * always know there is at least one SACK present already here.
...@@ -4355,8 +4387,6 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) ...@@ -4355,8 +4387,6 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
* If the sack array is full, forget about the last one. * If the sack array is full, forget about the last one.
*/ */
if (this_sack >= TCP_NUM_SACKS) { if (this_sack >= TCP_NUM_SACKS) {
if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
tcp_send_ack(sk);
this_sack--; this_sack--;
tp->rx_opt.num_sacks--; tp->rx_opt.num_sacks--;
sp--; sp--;
...@@ -5275,15 +5305,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) ...@@ -5275,15 +5305,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
tp->compressed_ack_rcv_nxt = tp->rcv_nxt; tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tp->dup_ack_counter = 0;
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
tp->compressed_ack - TCP_FASTRETRANS_THRESH);
tp->compressed_ack = 0;
} }
if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) tp->dup_ack_counter++;
goto send_now; goto send_now;
}
tp->compressed_ack++;
if (hrtimer_is_queued(&tp->compressed_ack_timer)) if (hrtimer_is_queued(&tp->compressed_ack_timer))
return; return;
...@@ -5296,7 +5324,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) ...@@ -5296,7 +5324,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
rtt * (NSEC_PER_USEC >> 3)/20); rtt * (NSEC_PER_USEC >> 3)/20);
sock_hold(sk); sock_hold(sk);
hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns,
HRTIMER_MODE_REL_PINNED_SOFT); HRTIMER_MODE_REL_PINNED_SOFT);
} }
......
...@@ -2780,6 +2780,7 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -2780,6 +2780,7 @@ static int __net_init tcp_sk_init(struct net *net)
sizeof(init_net.ipv4.sysctl_tcp_wmem)); sizeof(init_net.ipv4.sysctl_tcp_wmem));
} }
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
......
...@@ -184,10 +184,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, ...@@ -184,10 +184,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { if (unlikely(tp->compressed_ack)) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
tp->compressed_ack - TCP_FASTRETRANS_THRESH); tp->compressed_ack);
tp->compressed_ack = TCP_FASTRETRANS_THRESH; tp->compressed_ack = 0;
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk); __sock_put(sk);
} }
......
...@@ -753,8 +753,14 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) ...@@ -753,8 +753,14 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
bh_lock_sock(sk); bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) { if (!sock_owned_by_user(sk)) {
if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) if (tp->compressed_ack) {
/* Since we have to send one ack finally,
* subtract one from tp->compressed_ack to keep
* LINUX_MIB_TCPACKCOMPRESSED accurate.
*/
tp->compressed_ack--;
tcp_send_ack(sk); tcp_send_ack(sk);
}
} else { } else {
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
&sk->sk_tsq_flags)) &sk->sk_tsq_flags))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment