Commit 86f03776 authored by David S. Miller

Merge branch 'tcp-oom-probe'

Menglong Dong says:

====================
net: tcp: support probing OOM

In this series, we make some small changes so that TCP retransmissions
become zero-window probes if the receiver drops the skb because of
memory pressure.

In the 1st patch, we reply with a zero-window ACK if the skb is dropped
because the receiver is out of memory, instead of dropping the skb silently.

In the 2nd patch, we allow a zero-window ACK to update the window.

In the 3rd patch, we fix the unexpected socket death when snd_wnd is 0 in
tcp_retransmit_timer().

In the 4th patch, we refactor the debug message in
tcp_retransmit_timer() to make it more correct.

After these changes, TCP can keep probing a receiver that is out of memory indefinitely.

Changes since v3:
- make the timeout "2 * TCP_RTO_MAX" in the 3rd patch
- tp->retrans_stamp is not based on jiffies and can't be compared with
  icsk->icsk_timeout in the 3rd patch. Fix it.
- introduce the 4th patch

Changes since v2:
- refactor the code to avoid code duplication in the 1st patch
- use after() instead of max() in tcp_rtx_probe0_timed_out()

Changes since v1:
- send 0 rwin ACK for the receive queue empty case when necessary in the
  1st patch
- send the ACK immediately by using the ICSK_ACK_NOW flag in the 1st
  patch
- consider the case of the connection restarting from idle, as Neal commented,
  in the 3rd patch
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3e6860ec 031c44b7
...@@ -164,7 +164,8 @@ enum inet_csk_ack_state_t { ...@@ -164,7 +164,8 @@ enum inet_csk_ack_state_t {
ICSK_ACK_TIMER = 2, ICSK_ACK_TIMER = 2,
ICSK_ACK_PUSHED = 4, ICSK_ACK_PUSHED = 4,
ICSK_ACK_PUSHED2 = 8, ICSK_ACK_PUSHED2 = 8,
ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */
ICSK_ACK_NOMEM = 32,
}; };
void inet_csk_init_xmit_timers(struct sock *sk, void inet_csk_init_xmit_timers(struct sock *sk,
......
...@@ -3525,7 +3525,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, ...@@ -3525,7 +3525,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
{ {
return after(ack, tp->snd_una) || return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) || after(ack_seq, tp->snd_wl1) ||
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
} }
/* If we update tp->snd_una, also update tp->bytes_acked */ /* If we update tp->snd_una, also update tp->bytes_acked */
...@@ -5059,14 +5059,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) ...@@ -5059,14 +5059,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
/* Ok. In sequence. In window. */ /* Ok. In sequence. In window. */
queue_and_out: queue_and_out:
if (skb_queue_len(&sk->sk_receive_queue) == 0) if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
sk_forced_mem_schedule(sk, skb->truesize); /* TODO: maybe ratelimit these WIN 0 ACK ? */
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { inet_csk(sk)->icsk_ack.pending |=
(ICSK_ACK_NOMEM | ICSK_ACK_NOW);
inet_csk_schedule_ack(sk);
sk->sk_data_ready(sk);
if (skb_queue_len(&sk->sk_receive_queue)) {
reason = SKB_DROP_REASON_PROTO_MEM; reason = SKB_DROP_REASON_PROTO_MEM;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
sk->sk_data_ready(sk);
goto drop; goto drop;
} }
sk_forced_mem_schedule(sk, skb->truesize);
}
eaten = tcp_queue_rcv(sk, skb, &fragstolen); eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len) if (skb->len)
......
...@@ -257,11 +257,19 @@ EXPORT_SYMBOL(tcp_select_initial_window); ...@@ -257,11 +257,19 @@ EXPORT_SYMBOL(tcp_select_initial_window);
static u16 tcp_select_window(struct sock *sk) static u16 tcp_select_window(struct sock *sk)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
u32 old_win = tp->rcv_wnd;
u32 cur_win = tcp_receive_window(tp);
u32 new_win = __tcp_select_window(sk);
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
u32 old_win = tp->rcv_wnd;
u32 cur_win, new_win;
/* Make the window 0 if we failed to queue the data because we
* are out of memory. The window is temporary, so we don't store
* it on the socket.
*/
if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
return 0;
cur_win = tcp_receive_window(tp);
new_win = __tcp_select_window(sk);
if (new_win < cur_win) { if (new_win < cur_win) {
/* Danger Will Robinson! /* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else * Don't update rcv_wup/rcv_wnd here or else
......
...@@ -454,6 +454,22 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) ...@@ -454,6 +454,22 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
req->timeout << req->num_timeout, TCP_RTO_MAX); req->timeout << req->num_timeout, TCP_RTO_MAX);
} }
/*
 * tcp_rtx_probe0_timed_out() - has probing a zero send window via
 * retransmission gone on for too long?
 * @sk:  the socket whose retransmit timer fired
 * @skb: the skb being retransmitted; its timestamp is used as the start of
 *       the retransmission period when tp->retrans_stamp is not set
 *
 * Returns true only when BOTH of the following exceed 2 * TCP_RTO_MAX:
 *   - the distance between the timer expiry (icsk_timeout) and the last
 *     time something was received from the peer (tp->rcv_tstamp), and
 *   - the time elapsed since retransmission of this data started.
 * Returns false otherwise.
 */
static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
				     const struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const int timeout = TCP_RTO_MAX * 2;
	u32 rtx_delta;
	int rcv_delta;

	/* Keep this delta signed: if tp->rcv_tstamp happens to be newer than
	 * icsk_timeout, an unsigned difference would wrap around to a huge
	 * value, defeat the "heard from the peer recently" check below and
	 * let the socket be reported as timed out right after a receive.
	 */
	rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
	if (rcv_delta <= timeout)
		return false;

	/* The retransmit timestamps are not jiffies based, so the elapsed
	 * time is converted with msecs_to_jiffies() before being compared
	 * against the jiffies based timeout.
	 */
	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
			(tp->retrans_stamp ?: tcp_skb_timestamp(skb)));

	return rtx_delta > timeout;
}
/** /**
* tcp_retransmit_timer() - The TCP retransmit timeout handler * tcp_retransmit_timer() - The TCP retransmit timeout handler
...@@ -503,23 +519,26 @@ void tcp_retransmit_timer(struct sock *sk) ...@@ -503,23 +519,26 @@ void tcp_retransmit_timer(struct sock *sk)
* we cannot allow such beasts to hang infinitely. * we cannot allow such beasts to hang infinitely.
*/ */
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
u32 rtx_delta;
rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb));
if (sk->sk_family == AF_INET) { if (sk->sk_family == AF_INET) {
net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
&inet->inet_daddr, &inet->inet_daddr, ntohs(inet->inet_dport),
ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt,
inet->inet_num, jiffies_to_msecs(jiffies - tp->rcv_tstamp),
tp->snd_una, tp->snd_nxt); rtx_delta);
} }
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) { else if (sk->sk_family == AF_INET6) {
net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
&sk->sk_v6_daddr, &sk->sk_v6_daddr, ntohs(inet->inet_dport),
ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt,
inet->inet_num, jiffies_to_msecs(jiffies - tp->rcv_tstamp),
tp->snd_una, tp->snd_nxt); rtx_delta);
} }
#endif #endif
if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { if (tcp_rtx_probe0_timed_out(sk, skb)) {
tcp_write_err(sk); tcp_write_err(sk);
goto out; goto out;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment