Commit cd86972a authored by David S. Miller

Merge branch 'tcp-undo-congestion'

Yuchung Cheng says:

====================
undo congestion window on spurious SYN or SYNACK timeout

Linux TCP currently falls back to an initial congestion window of
1 packet after multiple SYN or SYNACK timeouts, per RFC6298. However,
such timeouts are often spurious on wireless or cellular networks
that experience high delay variance (e.g. ramping up dormant radios,
or local link-layer retransmission). Another case is when the
underlying path RTT is longer than the default SYN timeout
(e.g. 1 second). In these cases, starting the transfer with a
minimal congestion window is detrimental to the performance of
short flows.

One naive approach is to simply ignore SYN or SYNACK timeouts and
always use a larger or default initial window. This approach,
however, risks pouring gas on the fire when the network is already
highly congested. This is particularly true in data centers, where
applications can open thousands to millions of connections across
one or more hosts, resulting in high SYN drop rates (e.g. incast).

This patch set detects spurious SYN and SYNACK timeouts upon
completing the handshake, via the widely supported TCP timestamp
option. Upon such an event the sender reverts to the default
initial window to start the data transfer, getting the best of
both worlds. The feature covers both active and passive opens,
and both Fast Open and regular connections.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 6d1474a9 98fa6271
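
Before the diff, a minimal sketch of the detection logic the series adds
(hypothetical userspace names; the kernel implementation is
tcp_try_undo_spurious_syn() in the diff below): a handshake timeout is
judged spurious when the ACK's timestamp echo matches the timestamp
carried on the original SYN, proving the peer received the first
transmission and the retransmission was unnecessary.

    /* Hypothetical userspace sketch, not kernel code: mirrors the
     * tcp_try_undo_spurious_syn() logic added by this series.
     */
    #include <stdbool.h>
    #include <stdint.h>

    struct handshake {
            uint32_t undo_marker; /* nonzero once a SYN/SYNACK retransmit fired */
            uint32_t syn_stamp;   /* TCP timestamp sent on the original SYN */
    };

    /* The timeout was spurious iff the peer echoed the original SYN's
     * timestamp: the first SYN arrived, so cwnd need not stay at 1.
     */
    bool syn_timeout_was_spurious(const struct handshake *hs,
                                  bool saw_tstamp, uint32_t rcv_tsecr)
    {
            return hs->undo_marker && hs->syn_stamp &&
                   saw_tstamp && rcv_tsecr == hs->syn_stamp;
    }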
@@ -457,18 +457,6 @@ void tcp_init_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
-void tcp_init_transfer(struct sock *sk, int bpf_op)
-{
-        struct inet_connection_sock *icsk = inet_csk(sk);
-
-        tcp_mtup_init(sk);
-        icsk->icsk_af_ops->rebuild_header(sk);
-        tcp_init_metrics(sk);
-        tcp_call_bpf(sk, bpf_op, 0, NULL);
-        tcp_init_congestion_control(sk);
-        tcp_init_buffer_space(sk);
-}
-
 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 {
         struct sk_buff *skb = tcp_write_queue_tail(sk);
...
@@ -2252,7 +2252,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
  */
 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
 {
-        return !tp->retrans_stamp ||
+        return tp->retrans_stamp &&
                tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
 }
@@ -3521,7 +3521,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 {
         struct tcp_sock *tp = tcp_sk(sk);
 
-        if (rexmit == REXMIT_NONE)
+        if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
                 return;
 
         if (unlikely(rexmit == 2)) {
@@ -5647,6 +5647,32 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
+void tcp_init_transfer(struct sock *sk, int bpf_op)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        tcp_mtup_init(sk);
+        icsk->icsk_af_ops->rebuild_header(sk);
+        tcp_init_metrics(sk);
+
+        /* Initialize the congestion window to start the transfer.
+         * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+         * retransmitted. In light of RFC6298 more aggressive 1sec
+         * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+         * retransmission has occurred.
+         */
+        if (tp->total_retrans > 1 && tp->undo_marker)
+                tp->snd_cwnd = 1;
+        else
+                tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
+        tp->snd_cwnd_stamp = tcp_jiffies32;
+
+        tcp_call_bpf(sk, bpf_op, 0, NULL);
+        tcp_init_congestion_control(sk);
+        tcp_init_buffer_space(sk);
+}
+
 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 {
         struct tcp_sock *tp = tcp_sk(sk);
@@ -5748,6 +5774,21 @@ static void smc_check_reset_syn(struct tcp_sock *tp)
 #endif
 }
 
+static void tcp_try_undo_spurious_syn(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        u32 syn_stamp;
+
+        /* undo_marker is set when SYN or SYNACK times out. The timeout is
+         * spurious if the ACK's timestamp option echo value matches the
+         * original SYN timestamp.
+         */
+        syn_stamp = tp->retrans_stamp;
+        if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
+            syn_stamp == tp->rx_opt.rcv_tsecr)
+                tp->undo_marker = 0;
+}
+
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                                          const struct tcphdr *th)
 {
@@ -5815,6 +5856,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 tcp_ecn_rcv_synack(tp, th);
 
                 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+                tcp_try_undo_spurious_syn(sk);
                 tcp_ack(sk, skb, FLAG_SLOWPATH);
 
                 /* Ok.. it's good. Set up sequence numbers and
@@ -5973,6 +6015,27 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
         return 1;
 }
 
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
+{
+        tcp_try_undo_loss(sk, false);
+        inet_csk(sk)->icsk_retransmits = 0;
+
+        /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
+         * we no longer need req so release it.
+         */
+        reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false);
+
+        /* Re-arm the timer because data may have been sent out.
+         * This is similar to the regular data transmission case
+         * when new data has just been ack'ed.
+         *
+         * (TFO) - we could try to be more aggressive and
+         * retransmitting any data sooner based on when they
+         * are sent out.
+         */
+        tcp_rearm_rto(sk);
+}
+
 /*
  * This function implements the receiving procedure of RFC 793 for
  * all states except ESTABLISHED and TIME_WAIT.
@@ -6069,22 +6132,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                 if (!tp->srtt_us)
                         tcp_synack_rtt_meas(sk, req);
 
-                /* Once we leave TCP_SYN_RECV, we no longer need req
-                 * so release it.
-                 */
                 if (req) {
-                        inet_csk(sk)->icsk_retransmits = 0;
-                        reqsk_fastopen_remove(sk, req, false);
-                        /* Re-arm the timer because data may have been sent out.
-                         * This is similar to the regular data transmission case
-                         * when new data has just been ack'ed.
-                         *
-                         * (TFO) - we could try to be more aggressive and
-                         * retransmitting any data sooner based on when they
-                         * are sent out.
-                         */
-                        tcp_rearm_rto(sk);
+                        tcp_rcv_synrecv_state_fastopen(sk);
                 } else {
+                        tcp_try_undo_spurious_syn(sk);
+                        tp->retrans_stamp = 0;
                         tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
                         tp->copied_seq = tp->rcv_nxt;
                 }
@@ -6119,16 +6171,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
         case TCP_FIN_WAIT1: {
                 int tmo;
 
-                /* If we enter the TCP_FIN_WAIT1 state and we are a
-                 * Fast Open socket and this is the first acceptable
-                 * ACK we have received, this would have acknowledged
-                 * our SYNACK so stop the SYNACK timer.
-                 */
-                if (req) {
-                        /* We no longer need the request sock. */
-                        reqsk_fastopen_remove(sk, req, false);
-                        tcp_rearm_rto(sk);
-                }
+                if (req)
+                        tcp_rcv_synrecv_state_fastopen(sk);
+
                 if (tp->snd_una != tp->write_seq)
                         break;
@@ -6303,7 +6348,7 @@ static void tcp_openreq_init(struct request_sock *req,
         req->cookie_ts = 0;
         tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
         tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
-        tcp_rsk(req)->snt_synack = tcp_clock_us();
+        tcp_rsk(req)->snt_synack = 0;
         tcp_rsk(req)->last_oow_ack_time = 0;
         req->mss = rx_opt->mss_clamp;
         req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
...
@@ -512,16 +512,6 @@ void tcp_init_metrics(struct sock *sk)
                 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
         }
-        /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
-         * retransmitted. In light of RFC6298 more aggressive 1sec
-         * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
-         * retransmission has occurred.
-         */
-        if (tp->total_retrans > 1)
-                tp->snd_cwnd = 1;
-        else
-                tp->snd_cwnd = tcp_init_cwnd(tp, dst);
-        tp->snd_cwnd_stamp = tcp_jiffies32;
 }
 
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
...
@@ -522,6 +522,11 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
                 newtp->rx_opt.ts_recent_stamp = 0;
                 newtp->tcp_header_len = sizeof(struct tcphdr);
         }
+        if (req->num_timeout) {
+                newtp->undo_marker = treq->snt_isn;
+                newtp->retrans_stamp = div_u64(treq->snt_synack,
+                                               USEC_PER_SEC / TCP_TS_HZ);
+        }
         newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
         newtp->md5sig_info = NULL;      /*XXX*/
...
@@ -3247,7 +3247,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                 skb->skb_mstamp_ns = cookie_init_timestamp(req);
         else
 #endif
+        {
                 skb->skb_mstamp_ns = tcp_clock_ns();
+                if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
+                        tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
+        }
 
 #ifdef CONFIG_TCP_MD5SIG
         rcu_read_lock();
...
@@ -393,6 +393,9 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
                 tcp_write_err(sk);
                 return;
         }
+        /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
+        if (icsk->icsk_retransmits == 1)
+                tcp_enter_loss(sk);
         /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
          * returned from rtx_syn_ack() to make it more persistent like
          * regular retransmit because if the child socket has been accepted
...