Commit 718e14bb authored by David S. Miller

Merge branch 'tcp-RACK-fast-recovery'

Yuchung Cheng says:

====================
tcp: RACK fast recovery

The patch set enables RACK loss detection (draft-ietf-tcpm-rack-01)
to trigger fast recovery with a reordering timer.

Until now, RACK has run in an auxiliary mode where it only detects
packet losses after recovery has been triggered by other
algorithms (e.g., FACK). By inspecting packet timestamps, RACK can
start ACK-driven repairs in a timely manner. Several similar
heuristics are no longer needed and are either removed or disabled
to reduce the complexity of the Linux TCP loss recovery engine:

  1. FACK (Forward Acknowledgement)
  2. Early Retransmit (RFC5827)
  3. thin_dupack (fast recovery on single DUPACK for thin-streams)
  4. NCR (Non-Congestion Robustness, RFC4653)
  5. Forward Retransmit

After this change, Linux's loss recovery algorithms consist of:
  1. Conventional DUPACK threshold approach (RFC6675)
  2. RACK and Tail Loss Probe (draft-ietf-tcpm-rack-01)
  3. RTO plus F-RTO extension (RFC5682)

The patch set has been tested extensively on Google servers and
presented at several IETF meetings. The data suggests that RACK
successfully improves recovery performance:
https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-draft-ietf-tcpm-rack-01.pdf
https://www.ietf.org/proceedings/96/slides/slides-96-tcpm-3.pdf
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 7410191a 94bdc978
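
For orientation before the diff, the core rule this series implements can be modeled in a few lines of stand-alone C. This is a simplified sketch with invented names, not the kernel code: a packet is declared lost once a packet sent after it has been (s)acked and more than RTT plus a reordering window has elapsed since the packet was sent; otherwise the remaining wait is what the new reordering timer is armed for.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative per-connection RACK state: the send time and end sequence
 * of the most recently delivered (s)acked packet, the RTT measured from
 * that delivery, and the reordering allowance. Field names are invented
 * for this sketch.
 */
struct rack_model {
        uint64_t xmit_time_us;   /* send time of most recently (s)acked packet */
        uint32_t end_seq;        /* its ending sequence number (tie breaker) */
        uint32_t rtt_us;         /* RTT of that delivery */
        uint32_t reo_wnd_us;     /* reordering window, e.g. min_rtt / 4 */
};

/* A packet still in flight is deemed lost once something sent after it has
 * been delivered and rtt + reo_wnd has elapsed since the packet was sent.
 * Otherwise *wait_us reports how much longer the reordering timer must run.
 */
static bool rack_packet_lost(const struct rack_model *rack,
                             uint64_t pkt_xmit_us, uint32_t pkt_end_seq,
                             uint64_t now_us, int64_t *wait_us)
{
        bool sent_before = pkt_xmit_us < rack->xmit_time_us ||
                           (pkt_xmit_us == rack->xmit_time_us &&
                            pkt_end_seq < rack->end_seq);

        if (!sent_before)
                return false;    /* no later delivery: no evidence of loss yet */

        *wait_us = (int64_t)(rack->rtt_us + rack->reo_wnd_us) -
                   (int64_t)(now_us - pkt_xmit_us);
        return *wait_us < 0;
}

int main(void)
{
        /* Last delivery happened at t=100ms with RTT 40ms; reo_wnd is 10ms. */
        struct rack_model rack = { 100000, 2000, 40000, 10000 };
        int64_t wait;

        /* A packet sent at t=70ms, checked at t=130ms: 60ms elapsed exceeds
         * 40ms + 10ms, so RACK marks it lost without waiting for dupacks.
         */
        return rack_packet_lost(&rack, 70000, 1500, 130000, &wait) ? 0 : 1;
}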
...@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN ...@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN
Allows TCP to send "duplicate" SACKs. Allows TCP to send "duplicate" SACKs.
tcp_early_retrans - INTEGER tcp_early_retrans - INTEGER
Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold Tail loss probe (TLP) converts RTOs occurring due to tail
for triggering fast retransmit when the amount of outstanding data is losses into fast recovery (draft-ietf-tcpm-rack). Note that
small and when no previously unsent data can be transmitted (such TLP requires RACK to function properly (see tcp_recovery below)
that limited transmit could be used). Also controls the use of
Tail loss probe (TLP) that converts RTOs occurring due to tail
losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
Possible values: Possible values:
0 disables ER 0 disables TLP
1 enables ER 3 or 4 enables TLP
2 enables ER but delays fast recovery and fast retransmit
by a fourth of RTT. This mitigates connection falsely
recovers when network has a small degree of reordering
(less than 3 packets).
3 enables delayed ER and TLP.
4 enables TLP only.
Default: 3 Default: 3
tcp_ecn - INTEGER tcp_ecn - INTEGER
...@@ -712,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN ...@@ -712,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN
Documentation/networking/tcp-thin.txt Documentation/networking/tcp-thin.txt
Default: 0 Default: 0
tcp_thin_dupack - BOOLEAN
Enable dynamic triggering of retransmissions after one dupACK
for thin streams. If set, a check is performed upon reception
of a dupACK to determine if the stream is thin (less than 4
packets in flight). As long as the stream is found to be thin,
data is retransmitted on the first received dupACK. This
improves retransmission latency for non-aggressive thin
streams, often found to be time-dependent.
For more information on thin streams, see
Documentation/networking/tcp-thin.txt
Default: 0
tcp_limit_output_bytes - INTEGER tcp_limit_output_bytes - INTEGER
Controls TCP Small Queue limit per tcp socket. Controls TCP Small Queue limit per tcp socket.
TCP bulk sender tends to increase packets in flight until it TCP bulk sender tends to increase packets in flight until it
......
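
The documentation hunk above reduces tcp_early_retrans to a TLP on/off switch and defers loss detection to tcp_recovery. As a hedged illustration of flipping the two knobs from a program (using the documented /proc paths; root privileges assumed):

#include <stdio.h>

/* Write one integer to a sysctl file under /proc/sys. Returns 0 on success. */
static int write_sysctl(const char *path, int val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%d\n", val);
        return fclose(f);
}

int main(void)
{
        /* Keep TLP enabled (3 and 4 are the only meaningful values now). */
        int err = write_sysctl("/proc/sys/net/ipv4/tcp_early_retrans", 3);

        /* Bit 0x1 enables RACK loss detection (TCP_RACK_LOSS_DETECTION). */
        err |= write_sysctl("/proc/sys/net/ipv4/tcp_recovery", 1);
        return err ? 1 : 0;
}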
...@@ -207,6 +207,8 @@ struct tcp_sock { ...@@ -207,6 +207,8 @@ struct tcp_sock {
/* Information of the most recently (s)acked skb */ /* Information of the most recently (s)acked skb */
struct tcp_rack { struct tcp_rack {
struct skb_mstamp mstamp; /* (Re)sent time of the skb */ struct skb_mstamp mstamp; /* (Re)sent time of the skb */
u32 rtt_us; /* Associated RTT */
u32 end_seq; /* Ending TCP sequence of the skb */
u8 advanced; /* mstamp advanced since last lost marking */ u8 advanced; /* mstamp advanced since last lost marking */
u8 reord; /* reordering detected */ u8 reord; /* reordering detected */
} rack; } rack;
...@@ -218,12 +220,11 @@ struct tcp_sock { ...@@ -218,12 +220,11 @@ struct tcp_sock {
unused:5; unused:5;
u8 nonagle : 4,/* Disable Nagle algorithm? */ u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */ thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */ unused1 : 1,
repair : 1, repair : 1,
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8 repair_queue; u8 repair_queue;
u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ u8 syn_data:1, /* SYN includes data */
syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
...@@ -305,7 +306,6 @@ struct tcp_sock { ...@@ -305,7 +306,6 @@ struct tcp_sock {
*/ */
int lost_cnt_hint; int lost_cnt_hint;
u32 retransmit_high; /* L-bits may be on up to this seqno */
u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 high_seq; /* snd_nxt at onset of congestion */ u32 high_seq; /* snd_nxt at onset of congestion */
......
...@@ -144,6 +144,7 @@ struct inet_connection_sock { ...@@ -144,6 +144,7 @@ struct inet_connection_sock {
#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */
#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ #define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */
#define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */
#define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */
static inline struct inet_connection_sock *inet_csk(const struct sock *sk) static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{ {
...@@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, ...@@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
} }
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) { what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
what == ICSK_TIME_REO_TIMEOUT) {
icsk->icsk_pending = what; icsk->icsk_pending = what;
icsk->icsk_timeout = jiffies + when; icsk->icsk_timeout = jiffies + when;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
......
...@@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
* for local resources. * for local resources.
*/ */
#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */
#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
...@@ -261,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle; ...@@ -261,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle;
extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_linear_timeouts;
extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_early_retrans;
extern int sysctl_tcp_recovery;
#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_challenge_ack_limit;
extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_tso_segs;
...@@ -397,6 +401,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, ...@@ -397,6 +401,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
int tcp_child_process(struct sock *parent, struct sock *child, int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb); struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk); void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
void tcp_clear_retrans(struct tcp_sock *tp); void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk); void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk);
...@@ -541,6 +546,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); ...@@ -541,6 +546,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk); void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *); void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *); void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32); int tcp_trim_head(struct sock *, struct sk_buff *, u32);
int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
...@@ -559,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, ...@@ -559,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
const struct sk_buff *next_skb); const struct sk_buff *next_skb);
/* tcp_input.c */ /* tcp_input.c */
void tcp_resume_early_retransmit(struct sock *sk);
void tcp_rearm_rto(struct sock *sk); void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk); void tcp_reset(struct sock *sk);
...@@ -1031,23 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) ...@@ -1031,23 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
} }
/* TCP early-retransmit (ER) is similar to but more conservative than
* the thin-dupack feature. Enable ER only if thin-dupack is disabled.
*/
static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
{
struct net *net = sock_net((struct sock *)tp);
tp->do_early_retrans = sysctl_tcp_early_retrans &&
sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
net->ipv4.sysctl_tcp_reordering == 3;
}
static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
{
tp->do_early_retrans = 0;
}
static inline unsigned int tcp_left_out(const struct tcp_sock *tp) static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{ {
return tp->sacked_out + tp->lost_out; return tp->sacked_out + tp->lost_out;
...@@ -1856,17 +1844,11 @@ void tcp_v4_init(void); ...@@ -1856,17 +1844,11 @@ void tcp_v4_init(void);
void tcp_init(void); void tcp_init(void);
/* tcp_recovery.c */ /* tcp_recovery.c */
extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
/* Flags to enable various loss recovery features. See below */ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
extern int sysctl_tcp_recovery; const struct skb_mstamp *xmit_time,
const struct skb_mstamp *ack_time);
/* Use TCP RACK to detect (some) tail and retransmit losses */ extern void tcp_rack_reo_timeout(struct sock *sk);
#define TCP_RACK_LOST_RETRANS 0x1
extern int tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp,
const struct skb_mstamp *xmit_time, u8 sacked);
/* /*
* Save and compile IPv4 options, return a pointer to it * Save and compile IPv4 options, return a pointer to it
......
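
The header hunk above also turns sysctl_tcp_recovery into a bit mask, with TCP_RACK_LOSS_DETECTION (0x1) as its only defined bit, so the enablement check is a bitwise test. A trivial stand-alone model of that check, with an invented helper name:

#include <stdbool.h>
#include <stdio.h>

#define TCP_RACK_LOSS_DETECTION 0x1     /* bit defined in the header hunk above */

/* sysctl_tcp_recovery is a bit mask so further recovery features can be
 * added later without new sysctls; RACK loss detection is gated on bit 0x1.
 */
static bool rack_loss_detection_enabled(int sysctl_tcp_recovery)
{
        return (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) != 0;
}

int main(void)
{
        printf("tcp_recovery=1 -> RACK loss detection %s\n",
               rack_loss_detection_enabled(1) ? "on" : "off");
        return 0;
}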
...@@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, ...@@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
} }
if (icsk->icsk_pending == ICSK_TIME_RETRANS || if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1; r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits; r->idiag_retrans = icsk->icsk_retransmits;
......
...@@ -536,13 +536,6 @@ static struct ctl_table ipv4_table[] = { ...@@ -536,13 +536,6 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "tcp_thin_dupack",
.data = &sysctl_tcp_thin_dupack,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{ {
.procname = "tcp_early_retrans", .procname = "tcp_early_retrans",
.data = &sysctl_tcp_early_retrans, .data = &sysctl_tcp_early_retrans,
......
...@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk) ...@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk)
tp->mss_cache = TCP_MSS_DEFAULT; tp->mss_cache = TCP_MSS_DEFAULT;
tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
tcp_assign_congestion_control(sk); tcp_assign_congestion_control(sk);
tp->tsoffset = 0; tp->tsoffset = 0;
...@@ -2475,11 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, ...@@ -2475,11 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK: case TCP_THIN_DUPACK:
if (val < 0 || val > 1) if (val < 0 || val > 1)
err = -EINVAL; err = -EINVAL;
else {
tp->thin_dupack = val;
if (tp->thin_dupack)
tcp_disable_early_retrans(tp);
}
break; break;
case TCP_REPAIR: case TCP_REPAIR:
...@@ -2969,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, ...@@ -2969,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_THIN_LINEAR_TIMEOUTS: case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto; val = tp->thin_lto;
break; break;
case TCP_THIN_DUPACK: case TCP_THIN_DUPACK:
val = tp->thin_dupack; val = 0;
break; break;
case TCP_REPAIR: case TCP_REPAIR:
......
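
After the tcp.c hunk above, TCP_THIN_DUPACK is still accepted (values 0 or 1) for binary compatibility but no longer does anything, and getsockopt() always reports 0. A small user-space sketch of that behavior (assumes <linux/tcp.h> supplies the option constant; error handling trimmed):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>          /* TCP_THIN_DUPACK */

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int on = 1, val = -1;
        socklen_t len = sizeof(val);

        if (fd < 0)
                return 1;

        /* Still accepted (0 or 1), but a no-op after this series. */
        if (setsockopt(fd, IPPROTO_TCP, TCP_THIN_DUPACK, &on, sizeof(on)))
                perror("setsockopt");

        /* The kernel now always reports 0 for this option. */
        if (getsockopt(fd, IPPROTO_TCP, TCP_THIN_DUPACK, &val, &len) == 0)
                printf("TCP_THIN_DUPACK = %d\n", val);

        return 0;
}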
...@@ -79,7 +79,7 @@ ...@@ -79,7 +79,7 @@
int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_fack __read_mostly;
int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_max_reordering __read_mostly = 300;
int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_app_win __read_mostly = 31;
...@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly; ...@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
...@@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, ...@@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
tcp_disable_fack(tp); tcp_disable_fack(tp);
} }
if (metric > 0)
tcp_disable_early_retrans(tp);
tp->rack.reord = 1; tp->rack.reord = 1;
} }
...@@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) ...@@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
before(TCP_SKB_CB(skb)->seq, before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb; tp->retransmit_skb_hint = skb;
if (!tp->lost_out ||
after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
} }
/* Sum the number of packets on the wire we have marked as lost. /* Sum the number of packets on the wire we have marked as lost.
...@@ -1135,6 +1126,7 @@ struct tcp_sacktag_state { ...@@ -1135,6 +1126,7 @@ struct tcp_sacktag_state {
*/ */
struct skb_mstamp first_sackt; struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt; struct skb_mstamp last_sackt;
struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate; struct rate_sample *rate;
int flag; int flag;
}; };
...@@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk, ...@@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked; return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) { if (!(sacked & TCPCB_SACKED_ACKED)) {
tcp_rack_advance(tp, xmit_time, sacked); tcp_rack_advance(tp, sacked, end_seq,
xmit_time, &state->ack_time);
if (sacked & TCPCB_SACKED_RETRANS) { if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost, /* If the segment is not tagged as lost,
...@@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk) ...@@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
struct sk_buff *skb; struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */ bool is_reneg; /* is receiver reneging on SACKs? */
bool mark_lost; bool mark_lost;
...@@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk) ...@@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb); tp->lost_out += tcp_skb_pcount(skb);
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
} }
} }
tcp_verify_left_out(tp); tcp_verify_left_out(tp);
...@@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk) ...@@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk)
tp->high_seq = tp->snd_nxt; tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp); tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
* loss recovery is underway except recurring timeout(s) on * if a previous recovery is underway, otherwise it may incorrectly
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing * call a timeout spurious if some previously retransmitted packets
* are s/acked (sec 3.2). We do not apply that restriction since
* retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
* so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
* on PMTU discovery to avoid sending new data.
*/ */
tp->frto = sysctl_tcp_frto && tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
(new_recovery || icsk->icsk_retransmits) &&
!inet_csk(sk)->icsk_mtup.probe_size;
} }
/* If ACK arrived pointing to a remembered SACK, it means that our /* If ACK arrived pointing to a remembered SACK, it means that our
...@@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) ...@@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
} }
static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned long delay;
/* Delay early retransmit and entering fast recovery for
* max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
* available, or RTO is scheduled to fire first.
*/
if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
(flag & FLAG_ECE) || !tp->srtt_us)
return false;
delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
msecs_to_jiffies(2));
if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
return false;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
TCP_RTO_MAX);
return true;
}
/* Linux NewReno/SACK/FACK/ECN state machine. /* Linux NewReno/SACK/FACK/ECN state machine.
* -------------------------------------- * --------------------------------------
* *
...@@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) ...@@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* F.e. after RTO, when all the queue is considered as lost, * F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out. * lost_out = packets_out and in_flight = retrans_out.
* *
* Essentially, we have now two algorithms counting * Essentially, we have now a few algorithms detecting
* lost packets. * lost packets.
* *
* FACK: It is the simplest heuristics. As soon as we decided * If the receiver supports SACK:
*
* RFC6675/3517: It is the conventional algorithm. A packet is
* considered lost if the number of higher sequence packets
* SACKed is greater than or equal to the DUPACK threshold
* (reordering). This is implemented in tcp_mark_head_lost and
* tcp_update_scoreboard.
*
* RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
* (2017-) that checks timing instead of counting DUPACKs.
* Essentially a packet is considered lost if it's not S/ACKed
* after RTT + reordering_window, where both metrics are
* dynamically measured and adjusted. This is implemented in
* tcp_rack_mark_lost.
*
* FACK (Disabled by default. Subsumed by RACK):
* It is the simplest heuristics. As soon as we decided
* that something is lost, we decide that _all_ not SACKed * that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e. * packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out. * lost_out = fackets_out - sacked_out and left_out = fackets_out.
...@@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) ...@@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* takes place. We use FACK by default until reordering * takes place. We use FACK by default until reordering
* is suspected on the path to this destination. * is suspected on the path to this destination.
* *
* NewReno: when Recovery is entered, we assume that one segment * If the receiver does not support SACK:
*
* NewReno (RFC6582): in Recovery we assume that one segment
* is lost (classic Reno). While we are in Recovery and * is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet * a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno * is lost (NewReno). This heuristics are the same in NewReno
* and SACK. * and SACK.
* *
* Imagine, that's all! Forget about all this shamanism about CWND inflation
* deflation etc. CWND is real congestion window, never inflated, changes
* only according to classic VJ rules.
*
* Really tricky (and requiring careful tuning) part of algorithm * Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and, * The first determines the moment _when_ we should reduce CWND and,
...@@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) ...@@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
static bool tcp_time_to_recover(struct sock *sk, int flag) static bool tcp_time_to_recover(struct sock *sk, int flag)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
/* Trick#1: The loss is proven. */ /* Trick#1: The loss is proven. */
if (tp->lost_out) if (tp->lost_out)
...@@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) ...@@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
if (tcp_dupack_heuristics(tp) > tp->reordering) if (tcp_dupack_heuristics(tp) > tp->reordering)
return true; return true;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
!tcp_may_send_now(sk)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
return true;
}
/* If a thin stream is detected, retransmit after first
* received dupack. Employ only if SACK is supported in order
* to avoid possible corner-case series of spurious retransmissions
* Use only if there are no unsent data.
*/
if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
tcp_is_sack(tp) && !tcp_send_head(sk))
return true;
/* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
* retransmissions due to small network reorderings, we implement
* Mitigation A.3 in the RFC and delay the retransmission for a short
* interval if appropriate.
*/
if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
(tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
!tcp_may_send_now(sk))
return !tcp_pause_early_retransmit(sk, flag);
return false; return false;
} }
...@@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) ...@@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp); tcp_ecn_queue_cwr(tp);
} }
static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
int flag)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0; int sndcnt = 0;
...@@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk) ...@@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk)
} }
EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_simple_retransmit);
static void tcp_enter_recovery(struct sock *sk, bool ece_ack) void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
int mib_idx; int mib_idx;
...@@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, ...@@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
tcp_try_undo_loss(sk, false)) tcp_try_undo_loss(sk, false))
return; return;
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ /* The ACK (s)acks some never-retransmitted data meaning not all
/* Step 3.b. A timeout is spurious if not all data are * the data packets before the timeout were lost. Therefore we
* lost, i.e., never-retransmitted data are (s)acked. * undo the congestion window and state. This is essentially
* the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
* a retransmitted skb is permanently marked, we can apply such an
* operation even if F-RTO was not used.
*/ */
if ((flag & FLAG_ORIG_SACK_ACKED) && if ((flag & FLAG_ORIG_SACK_ACKED) &&
tcp_try_undo_loss(sk, true)) tcp_try_undo_loss(sk, tp->undo_marker))
return; return;
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
if (after(tp->snd_nxt, tp->high_seq)) { if (after(tp->snd_nxt, tp->high_seq)) {
if (flag & FLAG_DATA_SACKED || is_dupack) if (flag & FLAG_DATA_SACKED || is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */ tp->frto = 0; /* Step 3.a. loss was real */
...@@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) ...@@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
return false; return false;
} }
static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
const struct skb_mstamp *ack_time)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
u32 prior_retrans = tp->retrans_out;
tcp_rack_mark_lost(sk, ack_time);
if (prior_retrans > tp->retrans_out)
*ack_flag |= FLAG_LOST_RETRANS;
}
}
/* Process an event, which can update packets-in-flight not trivially. /* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out, * Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and * taking into account both packets sitting in receiver's buffer and
...@@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) ...@@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
* tcp_xmit_retransmit_queue(). * tcp_xmit_retransmit_queue().
*/ */
static void tcp_fastretrans_alert(struct sock *sk, const int acked, static void tcp_fastretrans_alert(struct sock *sk, const int acked,
bool is_dupack, int *ack_flag, int *rexmit) bool is_dupack, int *ack_flag, int *rexmit,
const struct skb_mstamp *ack_time)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
...@@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, ...@@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
} }
} }
/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
tcp_rack_mark_lost(sk)) {
flag |= FLAG_LOST_RETRANS;
*ack_flag |= FLAG_LOST_RETRANS;
}
/* E. Process state. */ /* E. Process state. */
switch (icsk->icsk_ca_state) { switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery: case TCP_CA_Recovery:
...@@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, ...@@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_keep_open(sk); tcp_try_keep_open(sk);
return; return;
} }
tcp_rack_identify_loss(sk, ack_flag, ack_time);
break; break;
case TCP_CA_Loss: case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit); tcp_process_loss(sk, flag, is_dupack, rexmit);
if (icsk->icsk_ca_state != TCP_CA_Open && tcp_rack_identify_loss(sk, ack_flag, ack_time);
!(flag & FLAG_LOST_RETRANS)) if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return; return;
/* Change state if cwnd is undone or retransmits are lost */ /* Change state if cwnd is undone or retransmits are lost */
default: default:
...@@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, ...@@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder) if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk); tcp_try_undo_dsack(sk);
tcp_rack_identify_loss(sk, ack_flag, ack_time);
if (!tcp_time_to_recover(sk, flag)) { if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag); tcp_try_to_open(sk, flag);
return; return;
...@@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk) ...@@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk)
} else { } else {
u32 rto = inet_csk(sk)->icsk_rto; u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */ /* Offset the time elapsed after installing regular RTO */
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk); struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp = const u32 rto_time_stamp =
...@@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk) ...@@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk)
} }
} }
/* This function is called when the delayed ER timer fires. TCP enters
* fast recovery and performs fast-retransmit.
*/
void tcp_resume_early_retransmit(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
tcp_rearm_rto(sk);
/* Stop if ER is disabled after the delayed ER timer is scheduled */
if (!tp->do_early_retrans)
return;
tcp_enter_recovery(sk, false);
tcp_update_scoreboard(sk, 1);
tcp_xmit_retransmit_queue(sk);
}
/* If we get here, the whole TSO packet has not been acked. */ /* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{ {
...@@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, ...@@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/ */
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked, u32 prior_snd_una, int *acked,
struct tcp_sacktag_state *sack, struct tcp_sacktag_state *sack)
struct skb_mstamp *now)
{ {
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt; struct skb_mstamp first_ackt, last_ackt;
struct skb_mstamp *now = &sack->ack_time;
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out; u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out; u32 reord = tp->packets_out;
...@@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ...@@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
} else if (tcp_is_sack(tp)) { } else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount; tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb)) if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, &skb->skb_mstamp, sacked); tcp_rack_advance(tp, sacked, scb->end_seq,
&skb->skb_mstamp,
&sack->ack_time);
} }
if (sacked & TCPCB_LOST) if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount; tp->lost_out -= acked_pcount;
...@@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ...@@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 lost = tp->lost; u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */ int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
struct skb_mstamp now;
sack_state.first_sackt.v64 = 0; sack_state.first_sackt.v64 = 0;
sack_state.rate = &rs; sack_state.rate = &rs;
...@@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ...@@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt)) if (after(ack, tp->snd_nxt))
goto invalid_ack; goto invalid_ack;
skb_mstamp_get(&now); skb_mstamp_get(&sack_state.ack_time);
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk); tcp_rearm_rto(sk);
if (after(ack, prior_snd_una)) { if (after(ack, prior_snd_una)) {
...@@ -3689,11 +3634,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ...@@ -3689,11 +3634,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */ /* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
&sack_state, &now); &sack_state);
if (tcp_ack_is_dubious(sk, flag)) { if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
&sack_state.ack_time);
} }
if (tp->tlp_high_seq) if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag); tcp_process_tlp_ack(sk, ack, flag);
...@@ -3708,15 +3654,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ...@@ -3708,15 +3654,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_schedule_loss_probe(sk); tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */ lost = tp->lost - lost; /* freshly marked lost */
tcp_rate_gen(sk, delivered, lost, &now, &rs); tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
tcp_cong_control(sk, ack, delivered, flag, &rs); sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit); tcp_xmit_recovery(sk, rexmit);
return 1; return 1;
no_queue: no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */ /* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) if (flag & FLAG_DSACKING_ACK)
tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
&sack_state.ack_time);
/* If this ack opens up a zero window, clear backoff. It was /* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than * being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission. * it needs to be for normal retransmission.
...@@ -3737,9 +3685,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ...@@ -3737,9 +3685,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* If data was DSACKed, see if we can undo a cwnd reduction. * If data was DSACKed, see if we can undo a cwnd reduction.
*/ */
if (TCP_SKB_CB(skb)->sacked) { if (TCP_SKB_CB(skb)->sacked) {
skb_mstamp_get(&sack_state.ack_time);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state); &sack_state);
tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
&sack_state.ack_time);
tcp_xmit_recovery(sk, rexmit); tcp_xmit_recovery(sk, rexmit);
} }
......
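
The rewritten comment block in the tcp_input.c hunk above contrasts the two detection rules that remain for SACK connections: the RFC 6675 dupthresh count and the RACK time-based check. The sketch below models both predicates in stand-alone C with invented parameter names and no per-skb scoreboard, purely for comparison:

#include <stdbool.h>
#include <stdint.h>

/* RFC 6675-style rule: a packet is presumed lost once at least
 * "dupthresh" (tp->reordering) higher-sequence packets have been SACKed.
 */
static bool rfc6675_lost(uint32_t sacked_higher_seq, uint32_t dupthresh)
{
        return sacked_higher_seq >= dupthresh;
}

/* RACK-style rule: a packet is presumed lost once a packet sent later has
 * been (s)acked and more than rtt + reo_wnd has passed since it was sent,
 * regardless of how many duplicate ACKs have arrived.
 */
static bool rack_lost(uint64_t pkt_xmit_us, uint64_t rack_xmit_us,
                      uint64_t now_us, uint32_t rtt_us, uint32_t reo_wnd_us)
{
        return pkt_xmit_us < rack_xmit_us &&
               now_us - pkt_xmit_us > (uint64_t)rtt_us + reo_wnd_us;
}

int main(void)
{
        /* One SACKed packet above the hole is not enough for RFC 6675 with
         * dupthresh 3, but RACK can still flag the hole once 50ms + 12ms
         * have elapsed since it was sent.
         */
        bool a = rfc6675_lost(1, 3);                       /* false */
        bool b = rack_lost(0, 10000, 70000, 50000, 12000); /* true  */

        return (a == false && b == true) ? 0 : 1;
}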
...@@ -2229,7 +2229,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) ...@@ -2229,7 +2229,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
int state; int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS || if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1; timer_active = 1;
timer_expires = icsk->icsk_timeout; timer_expires = icsk->icsk_timeout;
......
...@@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk) ...@@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk)
val = tcp_metric_get(tm, TCP_METRIC_REORDERING); val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val) { if (val && tp->reordering != val) {
tcp_disable_fack(tp); tcp_disable_fack(tp);
tcp_disable_early_retrans(tp);
tp->reordering = val; tp->reordering = val;
} }
......
...@@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, ...@@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->sacked_out = 0; newtp->sacked_out = 0;
newtp->fackets_out = 0; newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0; newtp->tlp_high_seq = 0;
newtp->lsndtime = treq->snt_synack.stamp_jiffies; newtp->lsndtime = treq->snt_synack.stamp_jiffies;
newsk->sk_txhash = treq->txhash; newsk->sk_txhash = treq->txhash;
......
...@@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) ...@@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out += tcp_skb_pcount(skb); tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
tcp_rearm_rto(sk); tcp_rearm_rto(sk);
}
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb)); tcp_skb_pcount(skb));
...@@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) ...@@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
u32 timeout, tlp_time_stamp, rto_time_stamp; u32 timeout, tlp_time_stamp, rto_time_stamp;
u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
return false;
/* No consecutive loss probes. */ /* No consecutive loss probes. */
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
tcp_rearm_rto(sk); tcp_rearm_rto(sk);
...@@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) ...@@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections /* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application. * in Open state, that are either limited by cwnd or application.
*/ */
if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) !tp->packets_out || !tcp_is_sack(tp) ||
icsk->icsk_ca_state != TCP_CA_Open)
return false; return false;
if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
...@@ -2831,36 +2828,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) ...@@ -2831,36 +2828,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
return err; return err;
} }
/* Check if we forward retransmits are possible in the current
* window/congestion state.
*/
static bool tcp_can_forward_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
/* Forward retransmissions are possible only during Recovery. */
if (icsk->icsk_ca_state != TCP_CA_Recovery)
return false;
/* No forward retransmissions in Reno are possible. */
if (tcp_is_reno(tp))
return false;
/* Yeah, we have to make difficult choice between forward transmission
* and retransmission... Both ways have their merits...
*
* For now we do not retransmit anything, while we have some new
* segments to send. In the other cases, follow rule 3 for
* NextSeg() specified in RFC3517.
*/
if (tcp_may_send_now(sk))
return false;
return true;
}
/* This gets called after a retransmit timeout, and the initially /* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged. It tries to continue * retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either * resending the rest of the retransmit queue, until either
...@@ -2875,24 +2842,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -2875,24 +2842,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
struct sk_buff *hole = NULL; struct sk_buff *hole = NULL;
u32 max_segs, last_lost; u32 max_segs;
int mib_idx; int mib_idx;
int fwd_rexmitting = 0;
if (!tp->packets_out) if (!tp->packets_out)
return; return;
if (!tp->lost_out)
tp->retransmit_high = tp->snd_una;
if (tp->retransmit_skb_hint) { if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint; skb = tp->retransmit_skb_hint;
last_lost = TCP_SKB_CB(skb)->end_seq;
if (after(last_lost, tp->retransmit_high))
last_lost = tp->retransmit_high;
} else { } else {
skb = tcp_write_queue_head(sk); skb = tcp_write_queue_head(sk);
last_lost = tp->snd_una;
} }
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
...@@ -2915,31 +2874,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -2915,31 +2874,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
*/ */
segs = min_t(int, segs, max_segs); segs = min_t(int, segs, max_segs);
if (fwd_rexmitting) { if (tp->retrans_out >= tp->lost_out) {
begin_fwd:
if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
break;
mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
tp->retransmit_high = last_lost;
if (!tcp_can_forward_retransmit(sk))
break; break;
/* Backtrack if necessary to non-L'ed skb */
if (hole) {
skb = hole;
hole = NULL;
}
fwd_rexmitting = 1;
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) { } else if (!(sacked & TCPCB_LOST)) {
if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb; hole = skb;
continue; continue;
} else { } else {
last_lost = TCP_SKB_CB(skb)->end_seq;
if (icsk->icsk_ca_state != TCP_CA_Loss) if (icsk->icsk_ca_state != TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPFASTRETRANS; mib_idx = LINUX_MIB_TCPFASTRETRANS;
else else
...@@ -2960,7 +2902,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -2960,7 +2902,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_in_cwnd_reduction(sk)) if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb); tp->prr_out += tcp_skb_pcount(skb);
if (skb == tcp_write_queue_head(sk)) if (skb == tcp_write_queue_head(sk) &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, inet_csk(sk)->icsk_rto,
TCP_RTO_MAX); TCP_RTO_MAX);
......
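
With forward retransmit removed, the tcp_output.c hunk above also narrows when a loss probe may be scheduled: tcp_early_retrans must be 3 or 4, data must be in flight, the peer must support SACK, and the connection must be in the Open state. A hedged boolean model of that gate, with kernel state flattened into plain parameters:

#include <stdbool.h>
#include <stdint.h>

#define TCP_CA_OPEN 0   /* stand-in for the kernel's TCP_CA_Open state */

/* Mirrors the rewritten gate in tcp_schedule_loss_probe(): a probe is only
 * scheduled when tcp_early_retrans is 3 or 4, data is in flight, the peer
 * supports SACK, and the connection is in the Open state.
 */
static bool tlp_allowed(int sysctl_tcp_early_retrans, uint32_t packets_out,
                        bool sack_ok, int ca_state)
{
        if (sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4)
                return false;
        return packets_out && sack_ok && ca_state == TCP_CA_OPEN;
}

int main(void)
{
        /* Default sysctl value 3: an Open, SACK-enabled flow with data
         * outstanding may schedule a probe.
         */
        return tlp_allowed(3, 10, true, TCP_CA_OPEN) ? 0 : 1;
}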
#include <linux/tcp.h> #include <linux/tcp.h>
#include <net/tcp.h> #include <net/tcp.h>
int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
/* Marks a packet lost, if some packet sent later has been (s)acked. static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
tcp_skb_mark_lost_uncond_verify(tp, skb);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
/* Account for retransmits that are lost again */
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
}
}
static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
const struct skb_mstamp *t2,
u32 seq1, u32 seq2)
{
return skb_mstamp_after(t1, t2) ||
(t1->v64 == t2->v64 && after(seq1, seq2));
}
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
*
* Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK * The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics: * but they look at different metrics:
* *
...@@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; ...@@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
* is being more resilient to reordering by simply allowing some * is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh. * "settling delay", instead of tweaking the dupthresh.
* *
* The current version is only used after recovery starts but can be * When tcp_rack_detect_loss() detects some packets are lost and we
* easily extended to detect the first loss. * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
* or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
* make us enter the CA_Recovery state.
*/ */
int tcp_rack_mark_lost(struct sock *sk) static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
u32 *reo_timeout)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
u32 reo_wnd, prior_retrans = tp->retrans_out; u32 reo_wnd;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
return 0;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
*reo_timeout = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay /* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related * RTT because reordering is often a path property and less related
* to queuing or delayed ACKs. * to queuing or delayed ACKs.
*
* TODO: measure and adapt to the observed reordering delay, and
* use a timer to retransmit like the delayed early retransmit.
*/ */
reo_wnd = 1000; reo_wnd = 1000;
if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) { tcp_for_write_queue(skb, sk) {
...@@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk) ...@@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk)
scb->sacked & TCPCB_SACKED_ACKED) scb->sacked & TCPCB_SACKED_ACKED)
continue; continue;
if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
tp->rack.end_seq, scb->end_seq)) {
/* Step 3 in draft-cheng-tcpm-rack-00.txt:
* A packet is lost if its elapsed time is beyond
* the recent RTT plus the reordering window.
*/
u32 elapsed = skb_mstamp_us_delta(now,
&skb->skb_mstamp);
s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
if (skb_mstamp_us_delta(&tp->rack.mstamp, if (remaining < 0) {
&skb->skb_mstamp) <= reo_wnd) tcp_rack_mark_skb_lost(sk, skb);
continue; continue;
/* skb is lost if packet sent later is sacked */
tcp_skb_mark_lost_uncond_verify(tp, skb);
if (scb->sacked & TCPCB_SACKED_RETRANS) {
scb->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPLOSTRETRANSMIT);
} }
/* Skip ones marked lost but not yet retransmitted */
if ((scb->sacked & TCPCB_LOST) &&
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
/* Record maximum wait time (+1 to avoid 0) */
*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
} else if (!(scb->sacked & TCPCB_RETRANS)) { } else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early /* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent * b/c the rest are all sent after rack_sent
...@@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk) ...@@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk)
break; break;
} }
} }
return prior_retrans - tp->retrans_out;
} }
/* Record the most recently (re)sent time among the (s)acked packets */ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
void tcp_rack_advance(struct tcp_sock *tp,
const struct skb_mstamp *xmit_time, u8 sacked)
{ {
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
if (!tp->rack.advanced)
return;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, now, &timeout);
if (timeout) {
timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
}
/* Record the most recently (re)sent time among the (s)acked packets
* This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
* draft-cheng-tcpm-rack-00.txt
*/
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
const struct skb_mstamp *xmit_time,
const struct skb_mstamp *ack_time)
{
u32 rtt_us;
if (tp->rack.mstamp.v64 && if (tp->rack.mstamp.v64 &&
!skb_mstamp_after(xmit_time, &tp->rack.mstamp)) !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
end_seq, tp->rack.end_seq))
return; return;
rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
if (sacked & TCPCB_RETRANS) { if (sacked & TCPCB_RETRANS) {
struct skb_mstamp now;
/* If the sacked packet was retransmitted, it's ambiguous /* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior * whether the retransmission or the original (or the prior
* retransmission) was sacked. * retransmission) was sacked.
...@@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp, ...@@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp,
* so it's at least one RTT (i.e., retransmission is at least * so it's at least one RTT (i.e., retransmission is at least
* an RTT later). * an RTT later).
*/ */
skb_mstamp_get(&now); if (rtt_us < tcp_min_rtt(tp))
if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
return; return;
} }
tp->rack.rtt_us = rtt_us;
tp->rack.mstamp = *xmit_time; tp->rack.mstamp = *xmit_time;
tp->rack.end_seq = end_seq;
tp->rack.advanced = 1; tp->rack.advanced = 1;
} }
/* We have waited long enough to accommodate reordering. Mark the expired
* packets lost and retransmit them.
*/
void tcp_rack_reo_timeout(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct skb_mstamp now;
u32 timeout, prior_inflight;
skb_mstamp_get(&now);
prior_inflight = tcp_packets_in_flight(tp);
tcp_rack_detect_loss(sk, &now, &timeout);
if (prior_inflight != tcp_packets_in_flight(tp)) {
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
tcp_enter_recovery(sk, false);
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_cwnd_reduction(sk, 1, 0);
}
tcp_xmit_retransmit_queue(sk);
}
if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
tcp_rearm_rto(sk);
}
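
To make the timer arithmetic in tcp_rack_detect_loss() and tcp_rack_mark_lost() concrete, here is a worked example with made-up numbers (path min RTT 20ms, last RACK RTT 25ms, suspect packet outstanding for 22ms): the reordering window is max(min_rtt/4, 1ms), the remaining wait is rtt + reo_wnd - elapsed, and the timer is armed for that remainder (+1) plus TCP_REO_TIMEOUT_MIN.

#include <stdint.h>
#include <stdio.h>

#define TCP_REO_TIMEOUT_MIN 2000        /* usec, as defined in the tcp.h hunk */

static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

int main(void)
{
        uint32_t min_rtt_us = 20000;    /* example path min RTT: 20 ms */
        uint32_t rack_rtt_us = 25000;   /* RTT of the most recent delivery */
        uint32_t elapsed_us = 22000;    /* time since the suspect packet was sent */

        /* min_rtt/4 settling delay, lower-bounded to 1000 usec */
        uint32_t reo_wnd_us = max_u32(min_rtt_us / 4, 1000);

        /* remaining < 0 would mean "mark it lost now"; here it is
         * 25000 + 5000 - 22000 = 8000 usec, so the packet is not lost yet
         * and the reordering timer is armed for roughly 10 ms.
         */
        int32_t remaining = (int32_t)(rack_rtt_us + reo_wnd_us - elapsed_us);
        uint32_t timeout_us = (uint32_t)(remaining + 1) + TCP_REO_TIMEOUT_MIN;

        printf("reo_wnd=%u us, remaining=%d us, timer=%u us\n",
               reo_wnd_us, remaining, timeout_us);
        return 0;
}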
...@@ -563,8 +563,8 @@ void tcp_write_timer_handler(struct sock *sk) ...@@ -563,8 +563,8 @@ void tcp_write_timer_handler(struct sock *sk)
event = icsk->icsk_pending; event = icsk->icsk_pending;
switch (event) { switch (event) {
case ICSK_TIME_EARLY_RETRANS: case ICSK_TIME_REO_TIMEOUT:
tcp_resume_early_retransmit(sk); tcp_rack_reo_timeout(sk);
break; break;
case ICSK_TIME_LOSS_PROBE: case ICSK_TIME_LOSS_PROBE:
tcp_send_loss_probe(sk); tcp_send_loss_probe(sk);
......
...@@ -1745,7 +1745,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) ...@@ -1745,7 +1745,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
srcp = ntohs(inet->inet_sport); srcp = ntohs(inet->inet_sport);
if (icsk->icsk_pending == ICSK_TIME_RETRANS || if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1; timer_active = 1;
timer_expires = icsk->icsk_timeout; timer_expires = icsk->icsk_timeout;
......