Commit 31770e34 authored by Florian Westphal, committed by David S. Miller

tcp: Revert "tcp: remove header prediction"

This reverts commit 45f119bf.

Eric Dumazet says:
  We found at Google a significant regression caused by
  45f119bf tcp: remove header prediction

  In typical RPC (TCP_RR), when a TCP socket receives data, we now call
  tcp_ack() while we used to not call it.

  This touches enough cache lines to cause a slowdown.

so the problem does not seem to be the HP removal itself but the tcp_ack()
call.  Therefore, it might be possible to remove HP after all, provided
one finds a way to elide tcp_ack for most cases.
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent c1d2b4c3
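For context, a minimal userspace sketch of the header-prediction idea this revert restores. It is an illustration only: build_pred_flags() and hdr_matches() are invented names mirroring __tcp_fast_path_on() and the pred_flags compare in tcp_rcv_established(), and the kernel additionally masks out the PSH bit (TCP_HP_BITS) before comparing.

/*
 * Illustration only, not kernel code.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define FLAG_ACK_HOST 0x00100000U	/* ACK bit of the doff/flags/window word */

/* Pack the expected data offset, the ACK bit and snd_wnd into one be32 word. */
static uint32_t build_pred_flags(unsigned int tcp_header_len, uint32_t snd_wnd)
{
	return htonl((tcp_header_len << 26) | FLAG_ACK_HOST | snd_wnd);
}

/* A single compare classifies a segment; the kernel also ignores PSH here. */
static int hdr_matches(uint32_t hdr_word_be, uint32_t pred_flags)
{
	return hdr_word_be == pred_flags;
}

int main(void)
{
	/* 20-byte header plus 12 bytes of aligned timestamps, window 0x1234 */
	uint32_t pred = build_pred_flags(32, 0x1234);
	/* an in-order data segment with the same doff, ACK set, same window */
	uint32_t word = htonl((32u << 26) | FLAG_ACK_HOST | 0x1234);

	printf("pred_flags=0x%08x fast_path=%d\n", ntohl(pred),
	       hdr_matches(word, pred));
	return 0;
}

When the compare succeeds, the segment's sequence number equals rcv_nxt and its ack_seq does not advance past snd_nxt, the restored fast path in tcp_rcv_established() queues the data and only calls tcp_ack() when ack_seq != tp->snd_una; that is exactly the per-segment tcp_ack() cost Eric's report measured. The TCPHPHits and TCPHPAcks counters re-added below record how often these shortcuts are taken.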
@@ -147,6 +147,12 @@ struct tcp_sock {
u16 tcp_header_len; /* Bytes of tcp header to send */
u16 gso_segs; /* Max number of segs per GSO packet */
/*
* Header prediction flags
* 0x5?10 << 16 + snd_wnd in net byte order
*/
__be32 pred_flags;
/*
* RFC793 variables by their proper names. This means you can
* read the code and the spec side by side (and laugh ...)
......
@@ -634,6 +634,29 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}

static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}

static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}

static inline void tcp_fast_path_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
tp->rcv_wnd &&
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
!tp->urg_data)
tcp_fast_path_on(tp);
}

/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
......
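As a worked example of the "0x5?10 << 16 + snd_wnd" layout noted in the pred_flags comment above, the sketch below splits such a word back into data offset, flags byte and window; decode_pred_flags() is an illustrative helper, not a kernel function. For a bare 20-byte header the top nibble is 5 and the only expected flag is ACK (0x10).

/* Illustration only: decode a pred_flags word built like __tcp_fast_path_on(). */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

static void decode_pred_flags(uint32_t pred_flags_be)
{
	uint32_t v = ntohl(pred_flags_be);	/* stored in network byte order */
	unsigned int doff  = v >> 28;		/* header length in 32-bit words */
	unsigned int flags = (v >> 16) & 0xff;	/* expected flags byte, 0x10 = ACK */
	unsigned int wnd   = v & 0xffff;	/* snd_wnd, already scaled down */

	printf("doff=%u (%u-byte header) flags=0x%02x snd_wnd=%u\n",
	       doff, doff * 4, flags, wnd);
}

int main(void)
{
	/* 20-byte header, ACK only, window 8192: the "0x5010 << 16" case */
	decode_pred_flags(htonl(0x50102000));
	return 0;
}

Note that tcp_fast_path_on() stores tp->snd_wnd right-shifted by rx_opt.snd_wscale, so the low 16 bits stay directly comparable with the unscaled window field of incoming segments.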
@@ -184,7 +184,9 @@ enum
LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */
LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */
LINUX_MIB_LISTENDROPS, /* ListenDrops */
LINUX_MIB_TCPHPHITS, /* TCPHPHits */
LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */
LINUX_MIB_TCPHPACKS, /* TCPHPAcks */
LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */
LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */
LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */
......
@@ -206,7 +206,9 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
......
@@ -1963,8 +1963,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
tcp_rcv_space_adjust(sk);
skip_copy:
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
tp->urg_data = 0;
tcp_fast_path_check(sk);
}
if (used + offset < skb->len)
continue;
......
@@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
@@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tp->pred_flags = 0;
tcp_fast_path_check(sk);
if (tcp_send_head(sk))
tcp_slow_start_after_idle_check(sk);
@@ -3592,7 +3599,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
u32 ack_ev_flags = CA_ACK_SLOWPATH;
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -4407,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->has_rxtstamp)
TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@ -4647,6 +4668,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
tcp_fast_path_check(sk);
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
@@ -4972,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
return -1;
}
@@ -5143,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
/* Disable header prediction. */
tp->pred_flags = 0;
}
/* This is the 'fast' part of urgent handling. */
@@ -5301,6 +5328,26 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
/*
* TCP receive function for the ESTABLISHED state.
*
* It is split into a fast path and a slow path. The fast path is
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
* - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
* (detected by checking the TCP header against pred_flags)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
* - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
* The first three cases are guaranteed by proper pred_flags setting,
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
@@ -5311,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
* The code loosely follows the one in the famous
* "30 instruction TCP receive" Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
* on a device interrupt, to call tcp_recv function
* on the receive process context and checksum and copy
* the buffer to user space. smart...
*
* Our current scheme is not silly either but we take the
* extra cost of the net_bh soft interrupt processing...
* We do checksum and copy also but from device to kernel.
*/
tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th->doff*4 due to pred_flags
* match.
*/
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
/* No? Slow path! */
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
/* If PAWS failed, check it more carefully in slow path */
if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
* and timestamp was corrupted part, it will result
* in a hung connection since we will drop all
* future packets due to the PAWS test.
*/
}
if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
/* We know that such packets are checksummed
* on entry.
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return;
} else { /* Header too small */
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
bool fragstolen = false;
if (tcp_checksum_complete(skb))
goto csum_error;
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
&fragstolen);
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
__tcp_ack_snd_check(sk, 0);
no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
sk->sk_data_ready(sk);
return;
}
}
slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
goto discard;
/*
* Standard slow path.
*/
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5376,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
else
tp->pred_flags = 0;
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5504,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_ack(sk, skb, FLAG_SLOWPATH);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
@@ -5740,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
@@ -5809,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
break;
case TCP_FIN_WAIT1: {
......
@@ -436,6 +436,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_sock *newtp = tcp_sk(newsk);
/* Now setup tcp_sock */
newtp->pred_flags = 0;
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->segs_in = 1;
......
@@ -295,7 +295,9 @@ static u16 tcp_select_window(struct sock *sk)
/* RFC1323 scaling applied */
new_win >>= tp->rx_opt.rcv_wscale;
/* If we advertise zero window, disable fast path. */
if (new_win == 0) {
tp->pred_flags = 0;
if (old_win)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPTOZEROWINDOWADV);
......