Commit 79ffeeb9 authored by Linus Torvalds's avatar Linus Torvalds

Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6

parents a5aac37f 6a438bbe
...@@ -78,6 +78,11 @@ inet_peer_gc_maxtime - INTEGER ...@@ -78,6 +78,11 @@ inet_peer_gc_maxtime - INTEGER
TCP variables: TCP variables:
tcp_abc - INTEGER
Controls Appropriate Byte Count defined in RFC3465. If set to
0 then congestion avoidance increases cwnd once per ack. 1 is the
conservative value, and 2 is more aggressive.
tcp_syn_retries - INTEGER tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 255. Default value will be retransmitted. Should not be higher than 255. Default value
......
...@@ -390,6 +390,7 @@ enum ...@@ -390,6 +390,7 @@ enum
NET_TCP_BIC_BETA=108, NET_TCP_BIC_BETA=108,
NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
NET_TCP_CONG_CONTROL=110, NET_TCP_CONG_CONTROL=110,
NET_TCP_ABC=111,
}; };
enum { enum {
......
...@@ -307,6 +307,21 @@ struct tcp_sock { ...@@ -307,6 +307,21 @@ struct tcp_sock {
struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
struct tcp_sack_block recv_sack_cache[4];
/* from STCP, retrans queue hinting */
struct sk_buff* lost_skb_hint;
struct sk_buff *scoreboard_skb_hint;
struct sk_buff *retransmit_skb_hint;
struct sk_buff *forward_skb_hint;
struct sk_buff *fastpath_skb_hint;
int fastpath_cnt_hint;
int lost_cnt_hint;
int retransmit_cnt_hint;
int forward_cnt_hint;
__u16 advmss; /* Advertised MSS */ __u16 advmss; /* Advertised MSS */
__u16 prior_ssthresh; /* ssthresh saved at recovery start */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */
__u32 lost_out; /* Lost packets */ __u32 lost_out; /* Lost packets */
...@@ -326,6 +341,7 @@ struct tcp_sock { ...@@ -326,6 +341,7 @@ struct tcp_sock {
__u32 snd_up; /* Urgent pointer */ __u32 snd_up; /* Urgent pointer */
__u32 total_retrans; /* Total retransmits for entire connection */ __u32 total_retrans; /* Total retransmits for entire connection */
__u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */
unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_time; /* time before keep alive takes place */
unsigned int keepalive_intvl; /* time interval between keep alive probes */ unsigned int keepalive_intvl; /* time interval between keep alive probes */
......
...@@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk) ...@@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
(skb != (struct sk_buff *)&(sk)->sk_write_queue); \ (skb != (struct sk_buff *)&(sk)->sk_write_queue); \
skb = skb->next) skb = skb->next)
/*
 * From STCP, for fast SACK processing.
 *
 * Walk the retransmit queue starting at the current value of @skb instead
 * of at the head of the queue: the loop has no initialiser, so the caller
 * must point @skb at the desired starting buffer first.  The walk ends when
 * @skb reaches sk->sk_send_head or wraps around to the queue head
 * (&sk->sk_write_queue).
 *
 * @skb is expanded several times and is not parenthesised, so pass a plain
 * variable.
 */
#define sk_stream_for_retrans_queue_from(skb, sk) \
for (; (skb != (sk)->sk_send_head) && \
(skb != (struct sk_buff *)&(sk)->sk_write_queue); \
skb = skb->next)
/* /*
* Default write policy as shown to user space via poll/select/SIGIO * Default write policy as shown to user space via poll/select/SIGIO
*/ */
......
...@@ -89,10 +89,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -89,10 +89,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
*/ */
#define TCP_SYN_RETRIES 5 /* number of times to retry active opening a #define TCP_SYN_RETRIES 5 /* number of times to retry active opening a
* connection: ~180sec is RFC minumum */ * connection: ~180sec is RFC minimum */
#define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a #define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a
* connection: ~180sec is RFC minumum */ * connection: ~180sec is RFC minimum */
#define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned #define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned
...@@ -180,7 +180,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -180,7 +180,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Flags in tp->nonagle */ /* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
#define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_CORK 2 /* Socket is corked */
#define TCP_NAGLE_PUSH 4 /* Cork is overriden for already queued data */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
extern struct inet_timewait_death_row tcp_death_row; extern struct inet_timewait_death_row tcp_death_row;
...@@ -218,6 +218,7 @@ extern int sysctl_tcp_low_latency; ...@@ -218,6 +218,7 @@ extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_tso_win_divisor;
extern int sysctl_tcp_abc;
extern atomic_t tcp_memory_allocated; extern atomic_t tcp_memory_allocated;
extern atomic_t tcp_sockets_allocated; extern atomic_t tcp_sockets_allocated;
...@@ -551,13 +552,13 @@ extern u32 __tcp_select_window(struct sock *sk); ...@@ -551,13 +552,13 @@ extern u32 __tcp_select_window(struct sock *sk);
/* TCP timestamps are only 32-bits, this causes a slight /* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot * complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decidely * of jiffies in the buffer control blocks below. We decidedly
* only use of the low 32-bits of jiffies and hide the ugly * only use of the low 32-bits of jiffies and hide the ugly
* casts with the following macro. * casts with the following macro.
*/ */
#define tcp_time_stamp ((__u32)(jiffies)) #define tcp_time_stamp ((__u32)(jiffies))
/* This is what the send packet queueing engine uses to pass /* This is what the send packet queuing engine uses to pass
* TCP per-packet control information to the transmission * TCP per-packet control information to the transmission
* code. We also store the host-order sequence numbers in * code. We also store the host-order sequence numbers in
* here too. This is 36 bytes on 32-bit architectures, * here too. This is 36 bytes on 32-bit architectures,
...@@ -597,7 +598,7 @@ struct tcp_skb_cb { ...@@ -597,7 +598,7 @@ struct tcp_skb_cb {
#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS) #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
#define TCPCB_URG 0x20 /* Urgent pointer advenced here */ #define TCPCB_URG 0x20 /* Urgent pointer advanced here */
#define TCPCB_AT_TAIL (TCPCB_URG) #define TCPCB_AT_TAIL (TCPCB_URG)
...@@ -765,6 +766,33 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) ...@@ -765,6 +766,33 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
(tp->snd_cwnd >> 2))); (tp->snd_cwnd >> 2)));
} }
/*
 * Slow start: open the congestion window by one segment per call, or by
 * two when byte counting detects a delayed ACK.  snd_cwnd is never raised
 * above snd_cwnd_clamp.
 *
 * When sysctl_tcp_abc is non-zero (RFC3465, Appropriate Byte Counting) the
 * increase is gated on tp->bytes_acked, which is consumed (reset to 0) each
 * time the window is grown.  With sysctl_tcp_abc == 0 every call grows the
 * window by one.
 * NOTE(review): bytes_acked is presumably accumulated by the ACK processing
 * path, which is not visible here -- confirm against the caller.
 */
static inline void tcp_slow_start(struct tcp_sock *tp)
{
	if (sysctl_tcp_abc) {
		/* RFC3465: Slow Start
		 * TCP sender SHOULD increase cwnd by the number of
		 * previously unacknowledged bytes ACKed by each incoming
		 * acknowledgment, provided the increase is not more than L
		 */
		if (tp->bytes_acked < tp->mss_cache)
			return;

		/* We MAY increase by 2 if discovered delayed ack.
		 * L is 2*SMSS, so an ACK covering exactly two full
		 * segments qualifies: the comparison must be inclusive.
		 */
		if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) {
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
		}
	}

	/* The acked bytes have been spent on this increase. */
	tp->bytes_acked = 0;

	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
}
static inline void tcp_sync_left_out(struct tcp_sock *tp) static inline void tcp_sync_left_out(struct tcp_sock *tp)
{ {
if (tp->rx_opt.sack_ok && if (tp->rx_opt.sack_ok &&
...@@ -794,6 +822,7 @@ static inline void tcp_enter_cwr(struct sock *sk) ...@@ -794,6 +822,7 @@ static inline void tcp_enter_cwr(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0; tp->prior_ssthresh = 0;
tp->bytes_acked = 0;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
__tcp_enter_cwr(sk); __tcp_enter_cwr(sk);
tcp_set_ca_state(sk, TCP_CA_CWR); tcp_set_ca_state(sk, TCP_CA_CWR);
...@@ -810,6 +839,27 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp) ...@@ -810,6 +839,27 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
return 3; return 3;
} }
/*
 * RFC2861: decide whether the sender is held back by the congestion window
 * (returns non-zero) or by the application (returns 0).
 * This is the inverse of the cwnd check in tcp_tso_should_defer.
 *
 * @sk:        socket being examined
 * @in_flight: number of packets currently in flight
 */
static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 room;

	/* The whole window is in use: cwnd is the limit. */
	if (tp->snd_cwnd <= in_flight)
		return 1;

	/* Route lacks NETIF_F_TSO: any unused window means not cwnd limited. */
	if ((sk->sk_route_caps & NETIF_F_TSO) == 0)
		return 0;

	/* Unused part of the window; non-zero because of the first check. */
	room = tp->snd_cwnd - in_flight;

	/* No divisor configured: compare the room against the max burst. */
	if (!sysctl_tcp_tso_win_divisor)
		return room <= tcp_max_burst(tp);

	/* Limited when the room is below snd_cwnd / divisor. */
	return room * sysctl_tcp_tso_win_divisor < tp->snd_cwnd;
}
static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
const struct sk_buff *skb) const struct sk_buff *skb)
{ {
...@@ -1157,6 +1207,15 @@ static inline void tcp_mib_init(void) ...@@ -1157,6 +1207,15 @@ static inline void tcp_mib_init(void)
TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1); TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1);
} }
/*
 * From STCP: forget every cached position in the retransmit queue by
 * setting all five skb hint pointers to NULL.  The companion *_cnt_hint
 * counters are left untouched.
 */
static inline void clear_all_retrans_hints(struct tcp_sock *tp)
{
	tp->fastpath_skb_hint =
	tp->forward_skb_hint =
	tp->retransmit_skb_hint =
	tp->scoreboard_skb_hint =
	tp->lost_skb_hint = NULL;
}
/* /proc */ /* /proc */
enum tcp_seq_states { enum tcp_seq_states {
TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_LISTENING,
......
...@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { ...@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_tcp_congestion_control, .proc_handler = &proc_tcp_congestion_control,
.strategy = &sysctl_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control,
}, },
{
.ctl_name = NET_TCP_ABC,
.procname = "tcp_abc",
.data = &sysctl_tcp_abc,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ .ctl_name = 0 } { .ctl_name = 0 }
}; };
......
...@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
} else if (tcp_need_reset(old_state) || } else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq && (tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
/* The last check adjusts for discrepance of Linux wrt. RFC /* The last check adjusts for discrepancy of Linux wrt. RFC
* states * states
*/ */
tcp_send_active_reset(sk, gfp_any()); tcp_send_active_reset(sk, gfp_any());
...@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0; tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff; tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0; tp->snd_cwnd_cnt = 0;
tp->bytes_acked = 0;
tcp_set_ca_state(sk, TCP_CA_Open); tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp); tcp_clear_retrans(tp);
inet_csk_delack_init(sk); inet_csk_delack_init(sk);
......
...@@ -217,14 +217,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, ...@@ -217,14 +217,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
bictcp_low_utilization(sk, data_acked); bictcp_low_utilization(sk, data_acked);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
/* In "safe" area, increase. */ tcp_slow_start(tp);
if (tp->snd_cwnd < tp->snd_cwnd_clamp) else {
tp->snd_cwnd++;
} else {
bictcp_update(ca, tp->snd_cwnd); bictcp_update(ca, tp->snd_cwnd);
/* In dangerous area, increase slowly. /* In dangerous area, increase slowly.
......
...@@ -186,17 +186,25 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, ...@@ -186,17 +186,25 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* In "safe" area, increase. */ /* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp);
/* In dangerous area, increase slowly. */
else if (sysctl_tcp_abc) {
/* RFC3465: Appropriate Byte Count
* increase once for each full cwnd acked
*/
if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
if (tp->snd_cwnd < tp->snd_cwnd_clamp) if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++; tp->snd_cwnd++;
}
} else { } else {
/* In dangerous area, increase slowly. /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp) if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++; tp->snd_cwnd++;
......
...@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk) ...@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
} }
static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
u32 in_flight, int good) u32 in_flight, u32 pkts_acked)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk); struct hstcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
if (tp->snd_cwnd < tp->snd_cwnd_clamp) tcp_slow_start(tp);
tp->snd_cwnd++; else {
} else {
/* Update AIMD parameters */ /* Update AIMD parameters */
if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
......
...@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ...@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk); struct htcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
/* In "safe" area, increase. */ tcp_slow_start(tp);
if (tp->snd_cwnd < tp->snd_cwnd_clamp) else {
tp->snd_cwnd++;
} else {
measure_rtt(sk); measure_rtt(sk);
/* keep track of number of round-trip times since last backoff event */ /* keep track of number of round-trip times since last backoff event */
......
...@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ...@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
ca->minrtt = tp->srtt; ca->minrtt = tp->srtt;
} }
if (!tcp_is_cwnd_limited(sk, in_flight))
return;
if (!ca->hybla_en) if (!ca->hybla_en)
return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
if (in_flight < tp->snd_cwnd)
return;
if (ca->rho == 0) if (ca->rho == 0)
hybla_recalc_param(sk); hybla_recalc_param(sk);
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
* Andi Kleen : Moved open_request checking here * Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests. * and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes. * Andi Kleen : Better prune_queue, and other fixes.
* Andrey Savochkin: Fix RTT measurements in the presnce of * Andrey Savochkin: Fix RTT measurements in the presence of
* timestamps. * timestamps.
* Andrey Savochkin: Check sequence numbers correctly when * Andrey Savochkin: Check sequence numbers correctly when
* removing SACKs due to in sequence incoming * removing SACKs due to in sequence incoming
...@@ -89,6 +89,7 @@ int sysctl_tcp_frto; ...@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
int sysctl_tcp_nometrics_save; int sysctl_tcp_nometrics_save;
int sysctl_tcp_moderate_rcvbuf = 1; int sysctl_tcp_moderate_rcvbuf = 1;
int sysctl_tcp_abc = 1;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
...@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) ...@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
* of receiver window. Check #2. * of receiver window. Check #2.
* *
* The scheme does not work when sender sends good segments opening * The scheme does not work when sender sends good segments opening
* window and then starts to feed us spagetti. But it should work * window and then starts to feed us spaghetti. But it should work
* in common situations. Otherwise, we have to rely on queue collapsing. * in common situations. Otherwise, we have to rely on queue collapsing.
*/ */
...@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, ...@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
{ {
/* Optimize this! */ /* Optimize this! */
int truesize = tcp_win_from_space(skb->truesize)/2; int truesize = tcp_win_from_space(skb->truesize)/2;
int window = tcp_full_space(sk)/2; int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
while (tp->rcv_ssthresh <= window) { while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len) if (truesize <= skb->len)
...@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) ...@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
/* Try to select rcvbuf so that 4 mss-sized segments /* Try to select rcvbuf so that 4 mss-sized segments
* will fit to window and correspoding skbs will fit to our rcvbuf. * will fit to window and corresponding skbs will fit to our rcvbuf.
* (was 3; 4 is minimum to allow fast retransmit to work.) * (was 3; 4 is minimum to allow fast retransmit to work.)
*/ */
while (tcp_win_from_space(rcvmem) < tp->advmss) while (tcp_win_from_space(rcvmem) < tp->advmss)
...@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) ...@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
} }
/* 4. Try to fixup all. It is made iimediately after connection enters /* 4. Try to fixup all. It is made immediately after connection enters
* established state. * established state.
*/ */
static void tcp_init_buffer_space(struct sock *sk) static void tcp_init_buffer_space(struct sock *sk)
...@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) ...@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
int ofo_win = 0;
icsk->icsk_ack.quick = 0; icsk->icsk_ack.quick = 0;
skb_queue_walk(&tp->out_of_order_queue, skb) {
ofo_win += skb->len;
}
/* If overcommit is due to out of order segments,
* do not clamp window. Try to expand rcvbuf instead.
*/
if (ofo_win) {
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_memory_pressure && !tcp_memory_pressure &&
atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]); sysctl_tcp_rmem[2]);
} }
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
app_win += ofo_win;
if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
app_win >>= 1;
if (app_win > icsk->icsk_ack.rcv_mss)
app_win -= icsk->icsk_ack.rcv_mss;
app_win = max(app_win, 2U*tp->advmss);
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
}
} }
/* Receiver "autotuning" code. /* Receiver "autotuning" code.
...@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) ...@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* are stalled on filesystem I/O. * are stalled on filesystem I/O.
* *
* Also, since we are only going for a minimum in the * Also, since we are only going for a minimum in the
* non-timestamp case, we do not smoothe things out * non-timestamp case, we do not smooth things out
* else with timestamps disabled convergance takes too * else with timestamps disabled convergence takes too
* long. * long.
*/ */
if (!win_dep) { if (!win_dep) {
...@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) ...@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
} else if (m < new_sample) } else if (m < new_sample)
new_sample = m << 3; new_sample = m << 3;
} else { } else {
/* No previous mesaure. */ /* No previous measure. */
new_sample = m << 3; new_sample = m << 3;
} }
...@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ ...@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
if (icsk->icsk_ack.ato > icsk->icsk_rto) if (icsk->icsk_ack.ato > icsk->icsk_rto)
icsk->icsk_ack.ato = icsk->icsk_rto; icsk->icsk_ack.ato = icsk->icsk_rto;
} else if (m > icsk->icsk_rto) { } else if (m > icsk->icsk_rto) {
/* Too long gap. Apparently sender falled to /* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly. * restart window, so that we send ACKs quickly.
*/ */
tcp_incr_quickack(sk); tcp_incr_quickack(sk);
...@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ ...@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break * To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics * it up into three procedures. -- erics
*/ */
static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
long m = mrtt; /* RTT */ long m = mrtt; /* RTT */
/* The following amusing code comes from Jacobson's /* The following amusing code comes from Jacobson's
...@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) ...@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
* *
* Funny. This algorithm seems to be very broken. * Funny. This algorithm seems to be very broken.
* These formulae increase RTO, when it should be decreased, increase * These formulae increase RTO, when it should be decreased, increase
* too slowly, when it should be incresed fastly, decrease too fastly * too slowly, when it should be increased fastly, decrease too fastly
* etc. I guess in BSD RTO takes ONE value, so that it is absolutely * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
* does not matter how to _calculate_ it. Seems, it was trap * does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8) * that VJ failed to avoid. 8)
...@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) ...@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
tp->rtt_seq = tp->snd_nxt; tp->rtt_seq = tp->snd_nxt;
} }
if (icsk->icsk_ca_ops->rtt_sample)
icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
} }
/* Calculate rto without backoff. This is the second half of Van Jacobson's /* Calculate rto without backoff. This is the second half of Van Jacobson's
...@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) ...@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
* at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
* to do with delayed acks, because at cwnd>2 true delack timeout * to do with delayed acks, because at cwnd>2 true delack timeout
* is invisible. Actually, Linux-2.4 also generates erratic * is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some curcumstances. * ACKs in some circumstances.
*/ */
inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
/* 2. Fixups made earlier cannot be right. /* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them, * If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced * all the algo is pure shit and should be replaced
* with correct one. It is exaclty, which we pretend to do. * with correct one. It is exactly, which we pretend to do.
*/ */
} }
...@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) ...@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
* to make it more realistic. * to make it more realistic.
* *
* A bit of theory. RTT is time passed after "normal" sized packet * A bit of theory. RTT is time passed after "normal" sized packet
* is sent until it is ACKed. In normal curcumstances sending small * is sent until it is ACKed. In normal circumstances sending small
* packets force peer to delay ACKs and calculation is correct too. * packets force peer to delay ACKs and calculation is correct too.
* The algorithm is adaptive and, provided we follow specs, it * The algorithm is adaptive and, provided we follow specs, it
* NEVER underestimate RTT. BUT! If peer tries to make some clever * NEVER underestimate RTT. BUT! If peer tries to make some clever
...@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ ...@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int prior_fackets; int prior_fackets;
u32 lost_retrans = 0; u32 lost_retrans = 0;
int flag = 0; int flag = 0;
int dup_sack = 0;
int i; int i;
if (!tp->sacked_out) if (!tp->sacked_out)
tp->fackets_out = 0; tp->fackets_out = 0;
prior_fackets = tp->fackets_out; prior_fackets = tp->fackets_out;
for (i=0; i<num_sacks; i++, sp++) { /* SACK fastpath:
struct sk_buff *skb; * if the only SACK change is the increase of the end_seq of
__u32 start_seq = ntohl(sp->start_seq); * the first block then only apply that SACK block
__u32 end_seq = ntohl(sp->end_seq); * and use retrans queue hinting otherwise slowpath */
int fack_count = 0; flag = 1;
int dup_sack = 0; for (i = 0; i< num_sacks; i++) {
__u32 start_seq = ntohl(sp[i].start_seq);
__u32 end_seq = ntohl(sp[i].end_seq);
if (i == 0){
if (tp->recv_sack_cache[i].start_seq != start_seq)
flag = 0;
} else {
if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
(tp->recv_sack_cache[i].end_seq != end_seq))
flag = 0;
}
tp->recv_sack_cache[i].start_seq = start_seq;
tp->recv_sack_cache[i].end_seq = end_seq;
/* Check for D-SACK. */ /* Check for D-SACK. */
if (i == 0) { if (i == 0) {
...@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ ...@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (before(ack, prior_snd_una - tp->max_window)) if (before(ack, prior_snd_una - tp->max_window))
return 0; return 0;
} }
}
if (flag)
num_sacks = 1;
else {
int j;
tp->fastpath_skb_hint = NULL;
/* order SACK blocks to allow in order walk of the retrans queue */
for (i = num_sacks-1; i > 0; i--) {
for (j = 0; j < i; j++){
if (after(ntohl(sp[j].start_seq),
ntohl(sp[j+1].start_seq))){
sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
}
}
}
}
/* clear flag as used for different purpose in following code */
flag = 0;
for (i=0; i<num_sacks; i++, sp++) {
struct sk_buff *skb;
__u32 start_seq = ntohl(sp->start_seq);
__u32 end_seq = ntohl(sp->end_seq);
int fack_count;
/* Use SACK fastpath hint if valid */
if (tp->fastpath_skb_hint) {
skb = tp->fastpath_skb_hint;
fack_count = tp->fastpath_cnt_hint;
} else {
skb = sk->sk_write_queue.next;
fack_count = 0;
}
/* Event "B" in the comment above. */ /* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq)) if (after(end_seq, tp->high_seq))
flag |= FLAG_DATA_LOST; flag |= FLAG_DATA_LOST;
sk_stream_for_retrans_queue(skb, sk) { sk_stream_for_retrans_queue_from(skb, sk) {
int in_sack, pcount; int in_sack, pcount;
u8 sacked; u8 sacked;
tp->fastpath_skb_hint = skb;
tp->fastpath_cnt_hint = fack_count;
/* The retransmission queue is always in order, so /* The retransmission queue is always in order, so
* we can short-circuit the walk early. * we can short-circuit the walk early.
*/ */
...@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ ...@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= tcp_skb_pcount(skb); tp->lost_out -= tcp_skb_pcount(skb);
tp->retrans_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
} }
} else { } else {
/* New sack for not retransmitted frame, /* New sack for not retransmitted frame,
...@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ ...@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (sacked & TCPCB_LOST) { if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
tp->lost_out -= tcp_skb_pcount(skb); tp->lost_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
} }
} }
...@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ ...@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb);
tp->retransmit_skb_hint = NULL;
} }
} }
} }
...@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ ...@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb); tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
...@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) ...@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Loss); tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark; tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp); TCP_ECN_queue_cwr(tp);
clear_all_retrans_hints(tp);
} }
void tcp_clear_retrans(struct tcp_sock *tp) void tcp_clear_retrans(struct tcp_sock *tp)
...@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how) ...@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_cnt = 0; tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp; tp->snd_cwnd_stamp = tcp_time_stamp;
tp->bytes_acked = 0;
tcp_clear_retrans(tp); tcp_clear_retrans(tp);
/* Push undo marker, if it was plain RTO and nothing /* Push undo marker, if it was plain RTO and nothing
...@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) ...@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state(sk, TCP_CA_Loss); tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt; tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp); TCP_ECN_queue_cwr(tp);
clear_all_retrans_hints(tp);
} }
static int tcp_check_sack_reneging(struct sock *sk) static int tcp_check_sack_reneging(struct sock *sk)
...@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, ...@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
int packets, u32 high_seq) int packets, u32 high_seq)
{ {
struct sk_buff *skb; struct sk_buff *skb;
int cnt = packets; int cnt;
BUG_TRAP(cnt <= tp->packets_out); BUG_TRAP(packets <= tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
} else {
skb = sk->sk_write_queue.next;
cnt = 0;
}
sk_stream_for_retrans_queue(skb, sk) { sk_stream_for_retrans_queue_from(skb, sk) {
cnt -= tcp_skb_pcount(skb); /* TODO: do this better */
if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) /* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
cnt += tcp_skb_pcount(skb);
if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break; break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb); tp->lost_out += tcp_skb_pcount(skb);
/* clear xmit_retransmit_queue hints
* if this is beyond hint */
if(tp->retransmit_skb_hint != NULL &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
tp->retransmit_skb_hint = NULL;
}
} }
} }
tcp_sync_left_out(tp); tcp_sync_left_out(tp);
...@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) ...@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
if (tcp_head_timedout(sk, tp)) { if (tcp_head_timedout(sk, tp)) {
struct sk_buff *skb; struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) { skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
if (tcp_skb_timedout(sk, skb) && : sk->sk_write_queue.next;
!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
sk_stream_for_retrans_queue_from(skb, sk) {
if (!tcp_skb_timedout(sk, skb))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb); tp->lost_out += tcp_skb_pcount(skb);
/* clear xmit_retrans hint */
if (tp->retransmit_skb_hint &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = NULL;
} }
} }
tp->scoreboard_skb_hint = skb;
tcp_sync_left_out(tp); tcp_sync_left_out(tp);
} }
} }
...@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) ...@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
} }
tcp_moderate_cwnd(tp); tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp; tp->snd_cwnd_stamp = tcp_time_stamp;
/* There is something screwy going on with the retrans hints after
an undo */
clear_all_retrans_hints(tp);
} }
static inline int tcp_may_undo(struct tcp_sock *tp) static inline int tcp_may_undo(struct tcp_sock *tp)
...@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) ...@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
sk_stream_for_retrans_queue(skb, sk) { sk_stream_for_retrans_queue(skb, sk) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
} }
clear_all_retrans_hints(tp);
DBGUNDO(sk, tp, "partial loss"); DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0; tp->lost_out = 0;
tp->left_out = tp->sacked_out; tp->left_out = tp->sacked_out;
...@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, ...@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
TCP_ECN_queue_cwr(tp); TCP_ECN_queue_cwr(tp);
} }
tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0; tp->snd_cwnd_cnt = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery); tcp_set_ca_state(sk, TCP_CA_Recovery);
} }
...@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, ...@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
} }
/* Read draft-ietf-tcplw-high-performance before mucking /* Read draft-ietf-tcplw-high-performance before mucking
* with this code. (Superceeds RFC1323) * with this code. (Supersedes RFC1323)
*/ */
static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{ {
/* RTTM Rule: A TSecr value received in a segment is used to /* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment * update the averaged RTT measurement only if the segment
...@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) ...@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru> * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
* *
* Changed: reset backoff as soon as we see the first valid sample. * Changed: reset backoff as soon as we see the first valid sample.
* If we do not, we get strongly overstimated rto. With timestamps * If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1 * samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed * increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments * answer arrives rto becomes 120 seconds! If at least one of segments
...@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) ...@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
*/ */
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
tcp_rtt_estimator(sk, seq_rtt, usrtt); tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk); tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0; inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk); tcp_bound_rto(sk);
} }
static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{ {
/* We don't have a timestamp. Can only use /* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine * packets that are not retransmitted to determine
...@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag ...@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
if (flag & FLAG_RETRANS_DATA_ACKED) if (flag & FLAG_RETRANS_DATA_ACKED)
return; return;
tcp_rtt_estimator(sk, seq_rtt, usrtt); tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk); tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0; inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk); tcp_bound_rto(sk);
} }
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
const s32 seq_rtt, u32 *usrtt) const s32 seq_rtt)
{ {
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tcp_ack_saw_tstamp(sk, usrtt, flag); tcp_ack_saw_tstamp(sk, flag);
else if (seq_rtt >= 0) else if (seq_rtt >= 0)
tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); tcp_ack_no_tstamp(sk, seq_rtt, flag);
} }
static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
...@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, ...@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
return acked; return acked;
} }
static inline u32 tcp_usrtt(const struct sk_buff *skb)
{
struct timeval tv, now;
do_gettimeofday(&now);
skb_get_timestamp(skb, &tv);
return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
}
/* Remove acknowledged frames from the retransmission queue. */ /* Remove acknowledged frames from the retransmission queue. */
static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb; struct sk_buff *skb;
__u32 now = tcp_time_stamp; __u32 now = tcp_time_stamp;
int acked = 0; int acked = 0;
__s32 seq_rtt = -1; __s32 seq_rtt = -1;
struct timeval usnow;
u32 pkts_acked = 0; u32 pkts_acked = 0;
void (*rtt_sample)(struct sock *sk, u32 usrtt)
if (seq_usrtt) = icsk->icsk_ca_ops->rtt_sample;
do_gettimeofday(&usnow);
while ((skb = skb_peek(&sk->sk_write_queue)) && while ((skb = skb_peek(&sk->sk_write_queue)) &&
skb != sk->sk_send_head) { skb != sk->sk_send_head) {
...@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt ...@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
tp->retrans_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb);
acked |= FLAG_RETRANS_DATA_ACKED; acked |= FLAG_RETRANS_DATA_ACKED;
seq_rtt = -1; seq_rtt = -1;
} else if (seq_rtt < 0) } else if (seq_rtt < 0) {
seq_rtt = now - scb->when; seq_rtt = now - scb->when;
if (seq_usrtt) { if (rtt_sample)
struct timeval tv; (*rtt_sample)(sk, tcp_usrtt(skb));
skb_get_timestamp(skb, &tv);
*seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
+ (usnow.tv_usec - tv.tv_usec);
} }
if (sacked & TCPCB_SACKED_ACKED) if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb); tp->sacked_out -= tcp_skb_pcount(skb);
if (sacked & TCPCB_LOST) if (sacked & TCPCB_LOST)
...@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt ...@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
!before(scb->end_seq, tp->snd_up)) !before(scb->end_seq, tp->snd_up))
tp->urg_mode = 0; tp->urg_mode = 0;
} }
} else if (seq_rtt < 0) } else if (seq_rtt < 0) {
seq_rtt = now - scb->when; seq_rtt = now - scb->when;
if (rtt_sample)
(*rtt_sample)(sk, tcp_usrtt(skb));
}
tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb); tcp_packets_out_dec(tp, skb);
__skb_unlink(skb, &sk->sk_write_queue); __skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb); sk_stream_free_skb(sk, skb);
clear_all_retrans_hints(tp);
} }
if (acked&FLAG_ACKED) { if (acked&FLAG_ACKED) {
const struct inet_connection_sock *icsk = inet_csk(sk); tcp_ack_update_rtt(sk, acked, seq_rtt);
tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
tcp_ack_packets_out(sk, tp); tcp_ack_packets_out(sk, tp);
if (icsk->icsk_ca_ops->pkts_acked) if (icsk->icsk_ca_ops->pkts_acked)
...@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) ...@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
} }
/* F-RTO affects on two new ACKs following RTO. /* F-RTO affects on two new ACKs following RTO.
* At latest on third ACK the TCP behavor is back to normal. * At latest on third ACK the TCP behavior is back to normal.
*/ */
tp->frto_counter = (tp->frto_counter + 1) % 3; tp->frto_counter = (tp->frto_counter + 1) % 3;
} }
...@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) ...@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight; u32 prior_in_flight;
s32 seq_rtt; s32 seq_rtt;
s32 seq_usrtt = 0;
int prior_packets; int prior_packets;
/* If the ack is newer than sent or older than previous acks /* If the ack is newer than sent or older than previous acks
...@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) ...@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (before(ack, prior_snd_una)) if (before(ack, prior_snd_una))
goto old_ack; goto old_ack;
if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
tp->bytes_acked += ack - prior_snd_una;
if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance. /* Window is constant, pure forward advance.
* No more checks are required. * No more checks are required.
...@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) ...@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
prior_in_flight = tcp_packets_in_flight(tp); prior_in_flight = tcp_packets_in_flight(tp);
/* See if we can take anything off of the retransmit queue. */ /* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, &seq_rtt, flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
if (tp->frto_counter) if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una); tcp_process_frto(sk, prior_snd_una);
if (tcp_ack_is_dubious(sk, flag)) { if (tcp_ack_is_dubious(sk, flag)) {
/* Advanve CWND, if state allows this. */ /* Advance CWND, if state allows this. */
if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
...@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, ...@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
{ {
struct sk_buff *skb; struct sk_buff *skb;
/* First, check that queue is collapsable and find /* First, check that queue is collapsible and find
* the point where collapsing can be useful. */ * the point where collapsing can be useful. */
for (skb = head; skb != tail; ) { for (skb = head; skb != tail; ) {
/* No new bits? It is possible on ofo queue. */ /* No new bits? It is possible on ofo queue. */
...@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) ...@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
/* /*
* This routine is only called when we have urgent data * This routine is only called when we have urgent data
* signalled. It's the 'slow' part of tcp_urg. It could be * signaled. It's the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one * moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as * place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961. * BSD still doesn't use the correction from RFC961.
...@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) ...@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* urgent. To do this requires some care. We cannot just ignore * urgent. To do this requires some care. We cannot just ignore
* tp->copied_seq since we would read the last urgent byte again * tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives * as data, nor can we alter copied_seq until this data arrives
* or we break the sematics of SIOCATMARK (and thus sockatmark()) * or we break the semantics of SIOCATMARK (and thus sockatmark())
* *
* NOTE. Double Dutch. Rendering to plain English: author of comment * NOTE. Double Dutch. Rendering to plain English: author of comment
* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
...@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, ...@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rx_opt.saw_tstamp = 0; tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd /* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_predition is to be made * if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2 * 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive * turn it off (when there are holes in the receive
...@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, ...@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/ */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!tp->srtt) !tp->srtt)
tcp_ack_saw_tstamp(sk, NULL, 0); tcp_ack_saw_tstamp(sk, 0);
if (tp->rx_opt.tstamp_ok) if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
...@@ -4372,6 +4471,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, ...@@ -4372,6 +4471,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_ecn);
EXPORT_SYMBOL(sysctl_tcp_reordering); EXPORT_SYMBOL(sysctl_tcp_reordering);
EXPORT_SYMBOL(sysctl_tcp_abc);
EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_parse_options);
EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_rcv_state_process);
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
* request_sock handling and moved * request_sock handling and moved
* most of it into the af independent code. * most of it into the af independent code.
* Added tail drop and some other bugfixes. * Added tail drop and some other bugfixes.
* Added new listen sematics. * Added new listen semantics.
* Mike McLagan : Routing by source * Mike McLagan : Routing by source
* Juan Jose Ciarlante: ip_dynaddr bits * Juan Jose Ciarlante: ip_dynaddr bits
* Andi Kleen: various fixes. * Andi Kleen: various fixes.
...@@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
/* An explanation is required here, I think. /* An explanation is required here, I think.
* Packet length and doff are validated by header prediction, * Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is elimineted. * provided case of th->doff==0 is eliminated.
* So, we defer the checks. */ * So, we defer the checks. */
if ((skb->ip_summed != CHECKSUM_UNNECESSARY && if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
tcp_v4_checksum_init(skb))) tcp_v4_checksum_init(skb)))
......
...@@ -158,7 +158,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -158,7 +158,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
/* I am shamed, but failed to make it more elegant. /* I am shamed, but failed to make it more elegant.
* Yes, it is direct reference to IP, which is impossible * Yes, it is direct reference to IP, which is impossible
* to generalize to IPv6. Taking into account that IPv6 * to generalize to IPv6. Taking into account that IPv6
* do not undertsnad recycling in any case, it not * do not understand recycling in any case, it not
* a big problem in practice. --ANK */ * a big problem in practice. --ANK */
if (tw->tw_family == AF_INET && if (tw->tw_family == AF_INET &&
tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
...@@ -194,7 +194,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -194,7 +194,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
/* In window segment, it may be only reset or bare ack. */ /* In window segment, it may be only reset or bare ack. */
if (th->rst) { if (th->rst) {
/* This is TIME_WAIT assasination, in two flavors. /* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this * Oh well... nobody has a sufficient solution to this
* protocol bug yet. * protocol bug yet.
*/ */
...@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, ...@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/ */
newtp->snd_cwnd = 2; newtp->snd_cwnd = 2;
newtp->snd_cwnd_cnt = 0; newtp->snd_cwnd_cnt = 0;
newtp->bytes_acked = 0;
newtp->frto_counter = 0; newtp->frto_counter = 0;
newtp->frto_highmark = 0; newtp->frto_highmark = 0;
...@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, ...@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* RFC793 page 36: "If the connection is in any non-synchronized state ... /* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet * and the incoming segment acknowledges something not yet
* sent (the segment carries an unaccaptable ACK) ... * sent (the segment carries an unacceptable ACK) ...
* a reset is sent." * a reset is sent."
* *
* Invalid ACK: reset will be sent by listening socket * Invalid ACK: reset will be sent by listening socket
......
...@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss ...@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
u16 flags; u16 flags;
BUG_ON(len > skb->len); BUG_ON(len > skb->len);
clear_all_retrans_hints(tp);
nsize = skb_headlen(skb) - len; nsize = skb_headlen(skb) - len;
if (nsize < 0) if (nsize < 0)
nsize = 0; nsize = 0;
...@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) ...@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
for TCP options, but includes only bare TCP header. for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup. tp->rx_opt.mss_clamp is mss negotiated at connection setup.
It is minumum of user_mss and mss received with SYN. It is minimum of user_mss and mss received with SYN.
It also does not include TCP options. It also does not include TCP options.
tp->pmtu_cookie is last pmtu, seen by this function. tp->pmtu_cookie is last pmtu, seen by this function.
...@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk) ...@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
/* MSS for the peer's data. Previous verions used mss_clamp /* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses * here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct * of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss * but may be worse for the performance because of rcv_mss
...@@ -1260,6 +1262,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m ...@@ -1260,6 +1262,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
BUG_ON(tcp_skb_pcount(skb) != 1 || BUG_ON(tcp_skb_pcount(skb) != 1 ||
tcp_skb_pcount(next_skb) != 1); tcp_skb_pcount(next_skb) != 1);
/* changing transmit queue under us so clear hints */
clear_all_retrans_hints(tp);
/* Ok. We will be able to collapse the packet. */ /* Ok. We will be able to collapse the packet. */
__skb_unlink(next_skb, &sk->sk_write_queue); __skb_unlink(next_skb, &sk->sk_write_queue);
...@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) ...@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk)
} }
} }
clear_all_retrans_hints(tp);
if (!lost) if (!lost)
return; return;
...@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int err; int err;
/* Do not send more than we queued. 1/4 is reserved for possible /* Do not send more than we queued. 1/4 is reserved for possible
* copying overhead: frgagmentation, tunneling, mangling etc. * copying overhead: fragmentation, tunneling, mangling etc.
*/ */
if (atomic_read(&sk->sk_wmem_alloc) > if (atomic_read(&sk->sk_wmem_alloc) >
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
...@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
int packet_cnt = tp->lost_out; int packet_cnt;
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
packet_cnt = tp->retransmit_cnt_hint;
}else{
skb = sk->sk_write_queue.next;
packet_cnt = 0;
}
/* First pass: retransmit lost packets. */ /* First pass: retransmit lost packets. */
if (packet_cnt) { if (tp->lost_out) {
sk_stream_for_retrans_queue(skb, sk) { sk_stream_for_retrans_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked; __u8 sacked = TCP_SKB_CB(skb)->sacked;
/* we could do better than to assign each time */
tp->retransmit_skb_hint = skb;
tp->retransmit_cnt_hint = packet_cnt;
/* Assume this retransmit will generate /* Assume this retransmit will generate
* only one packet for congestion window * only one packet for congestion window
* calculation purposes. This works because * calculation purposes. This works because
...@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
return; return;
if (sacked&TCPCB_LOST) { if (sacked & TCPCB_LOST) {
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
if (tcp_retransmit_skb(sk, skb)) if (tcp_retransmit_skb(sk, skb)) {
tp->retransmit_skb_hint = NULL;
return; return;
}
if (icsk->icsk_ca_state != TCP_CA_Loss) if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else else
...@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
TCP_RTO_MAX); TCP_RTO_MAX);
} }
packet_cnt -= tcp_skb_pcount(skb); packet_cnt += tcp_skb_pcount(skb);
if (packet_cnt <= 0) if (packet_cnt >= tp->lost_out)
break; break;
} }
} }
...@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_may_send_now(sk, tp)) if (tcp_may_send_now(sk, tp))
return; return;
if (tp->forward_skb_hint) {
skb = tp->forward_skb_hint;
packet_cnt = tp->forward_cnt_hint;
} else{
skb = sk->sk_write_queue.next;
packet_cnt = 0; packet_cnt = 0;
}
sk_stream_for_retrans_queue_from(skb, sk) {
tp->forward_cnt_hint = packet_cnt;
tp->forward_skb_hint = skb;
sk_stream_for_retrans_queue(skb, sk) {
/* Similar to the retransmit loop above we /* Similar to the retransmit loop above we
* can pretend that the retransmitted SKB * can pretend that the retransmitted SKB
* we send out here will be composed of one * we send out here will be composed of one
...@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
continue; continue;
/* Ok, retransmit it. */ /* Ok, retransmit it. */
if (tcp_retransmit_skb(sk, skb)) if (tcp_retransmit_skb(sk, skb)) {
tp->forward_skb_hint = NULL;
break; break;
}
if (skb == skb_peek(&sk->sk_write_queue)) if (skb == skb_peek(&sk->sk_write_queue))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
...@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect); ...@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss); EXPORT_SYMBOL(tcp_sync_mss);
EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
...@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ...@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag) u32 in_flight, int flag)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (in_flight < tp->snd_cwnd)
if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
tp->snd_cwnd++; tcp_slow_start(tp);
} else { else {
tp->snd_cwnd_cnt++; tp->snd_cwnd_cnt++;
if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++; tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0; tp->snd_cwnd_cnt = 0;
} }
} }
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
tp->snd_cwnd_stamp = tcp_time_stamp;
} }
static u32 tcp_scalable_ssthresh(struct sock *sk) static u32 tcp_scalable_ssthresh(struct sock *sk)
......
...@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) ...@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
* to prevent DoS attacks. It is called when a retransmission timeout * to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket. * or zero probe timeout occurs on orphaned socket.
* *
* Criterium is still not confirmed experimentally and may change. * Criteria is still not confirmed experimentally and may change.
* We kill the socket, if: * We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured * 1. If number of orphaned sockets exceeds an administratively configured
* limit. * limit.
...@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) ...@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
hole detection. :-( hole detection. :-(
It is place to make it. It is not made. I do not want It is place to make it. It is not made. I do not want
to make it. It is disguisting. It does not work in any to make it. It is disgusting. It does not work in any
case. Let me to cite the same draft, which requires for case. Let me to cite the same draft, which requires for
us to implement this: us to implement this:
......
...@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas /* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno. * calculation, so we'll behave like Reno.
*/ */
if (tp->snd_cwnd > tp->snd_ssthresh) tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt);
tp->snd_cwnd++;
} else { } else {
u32 rtt, target_cwnd, diff; u32 rtt, target_cwnd, diff;
...@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
*/ */
diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
if (tp->snd_cwnd < tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */ /* Slow start. */
if (diff > gamma) { if (diff > gamma) {
/* Going too fast. Time to slow down /* Going too fast. Time to slow down
...@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
V_PARAM_SHIFT)+1); V_PARAM_SHIFT)+1);
} }
tcp_slow_start(tp);
} else { } else {
/* Congestion avoidance. */ /* Congestion avoidance. */
u32 next_snd_cwnd; u32 next_snd_cwnd;
...@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
else if (next_snd_cwnd < tp->snd_cwnd) else if (next_snd_cwnd < tp->snd_cwnd)
tp->snd_cwnd--; tp->snd_cwnd--;
} }
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
tp->snd_cwnd = tp->snd_cwnd_clamp;
}
} }
/* Wipe the slate clean for the next RTT. */ /* Wipe the slate clean for the next RTT. */
vegas->cntRTT = 0; vegas->cntRTT = 0;
vegas->minRTT = 0x7fffffff; vegas->minRTT = 0x7fffffff;
}
/* The following code is executed for every ack we receive,
* except for conditions checked in should_advance_cwnd()
* before the call to tcp_cong_avoid(). Mainly this means that
* we only execute this code if the ack actually acked some
* data.
*/
/* If we are in slow start, increase our cwnd in response to this ACK.
* (If we are not in slow start then we are in congestion avoidance,
* and adjust our congestion window only once per RTT. See the code
* above.)
*/
if (tp->snd_cwnd <= tp->snd_ssthresh)
tp->snd_cwnd++;
/* to keep cwnd from growing without bound */
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
/* Make sure that we are never so timid as to reduce our cwnd below
* 2 MSS.
*
* Going below 2 MSS would risk huge delayed ACKs from our receiver.
*/
tp->snd_cwnd = max(tp->snd_cwnd, 2U);
} }
/* Extract info for Tcp socket info provided via netlink. */ /* Extract info for Tcp socket info provided via netlink. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment