Commit 0f4b437b authored by Jakub Kicinski

Merge branch 'tcp-fix-tcp_poll-races'

Eric Dumazet says:

====================
tcp: fix tcp_poll() races

Flakes in packetdrill tests stressing epoll_wait()
were root caused to bad ordering in tcp_write_err()

Precisely, we have to call sk_error_report() after
tcp_done().

When fixing this issue, we discovered tcp_abort(),
tcp_v4_err() and tcp_v6_err() had similar issues.

Since tcp_reset() has the correct ordering,
first patch takes part of it and creates
tcp_done_with_error() helper.
====================

Link: https://lore.kernel.org/r/20240528125253.1966136-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c3390677 fde6f897
...@@ -677,6 +677,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, ...@@ -677,6 +677,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
/* tcp_input.c */ /* tcp_input.c */
void tcp_rearm_rto(struct sock *sk); void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_done_with_error(struct sock *sk, int err);
void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_reset(struct sock *sk, struct sk_buff *skb);
void tcp_fin(struct sock *sk); void tcp_fin(struct sock *sk);
void tcp_check_space(struct sock *sk); void tcp_check_space(struct sock *sk);
......
...@@ -598,7 +598,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) ...@@ -598,7 +598,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
*/ */
mask |= EPOLLOUT | EPOLLWRNORM; mask |= EPOLLOUT | EPOLLWRNORM;
} }
/* This barrier is coupled with smp_wmb() in tcp_reset() */ /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
smp_rmb(); smp_rmb();
if (READ_ONCE(sk->sk_err) || if (READ_ONCE(sk->sk_err) ||
!skb_queue_empty_lockless(&sk->sk_error_queue)) !skb_queue_empty_lockless(&sk->sk_error_queue))
...@@ -4576,14 +4576,10 @@ int tcp_abort(struct sock *sk, int err) ...@@ -4576,14 +4576,10 @@ int tcp_abort(struct sock *sk, int err)
bh_lock_sock(sk); bh_lock_sock(sk);
if (!sock_flag(sk, SOCK_DEAD)) { if (!sock_flag(sk, SOCK_DEAD)) {
WRITE_ONCE(sk->sk_err, err);
/* This barrier is coupled with smp_rmb() in tcp_poll() */
smp_wmb();
sk_error_report(sk);
if (tcp_need_reset(sk->sk_state)) if (tcp_need_reset(sk->sk_state))
tcp_send_active_reset(sk, GFP_ATOMIC, tcp_send_active_reset(sk, GFP_ATOMIC,
SK_RST_REASON_NOT_SPECIFIED); SK_RST_REASON_NOT_SPECIFIED);
tcp_done(sk); tcp_done_with_error(sk, err);
} }
bh_unlock_sock(sk); bh_unlock_sock(sk);
......
...@@ -4436,9 +4436,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, ...@@ -4436,9 +4436,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
return SKB_NOT_DROPPED_YET; return SKB_NOT_DROPPED_YET;
} }
/* Abort the socket with error @err, making the failure visible to
 * userspace in a race-free order.
 *
 * Per the commit message above, sk_error_report() must run only after
 * tcp_done(): a tcp_poll() woken by the report must observe both the
 * error and the final socket state, or epoll_wait() callers can see a
 * transient inconsistent snapshot.
 */
void tcp_done_with_error(struct sock *sk, int err)
{
/* This barrier is coupled with smp_rmb() in tcp_poll() */
WRITE_ONCE(sk->sk_err, err);
/* Publish sk_err before the state changes below, pairing with the
 * smp_rmb() that tcp_poll() issues before reading sk->sk_err.
 */
smp_wmb();
tcp_write_queue_purge(sk);
tcp_done(sk);
/* Wake sleepers last, so they see the fully-torn-down socket.
 * Skip the report for orphaned (SOCK_DEAD) sockets: no userspace
 * waiter is left to notify.
 */
if (!sock_flag(sk, SOCK_DEAD))
sk_error_report(sk);
}
EXPORT_SYMBOL(tcp_done_with_error);
/* When we get a reset we do this. */ /* When we get a reset we do this. */
void tcp_reset(struct sock *sk, struct sk_buff *skb) void tcp_reset(struct sock *sk, struct sk_buff *skb)
{ {
int err;
trace_tcp_receive_reset(sk); trace_tcp_receive_reset(sk);
/* mptcp can't tell us to ignore reset pkts, /* mptcp can't tell us to ignore reset pkts,
...@@ -4450,24 +4467,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) ...@@ -4450,24 +4467,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
/* We want the right error as BSD sees it (and indeed as we do). */ /* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) { switch (sk->sk_state) {
case TCP_SYN_SENT: case TCP_SYN_SENT:
WRITE_ONCE(sk->sk_err, ECONNREFUSED); err = ECONNREFUSED;
break; break;
case TCP_CLOSE_WAIT: case TCP_CLOSE_WAIT:
WRITE_ONCE(sk->sk_err, EPIPE); err = EPIPE;
break; break;
case TCP_CLOSE: case TCP_CLOSE:
return; return;
default: default:
WRITE_ONCE(sk->sk_err, ECONNRESET); err = ECONNRESET;
} }
/* This barrier is coupled with smp_rmb() in tcp_poll() */ tcp_done_with_error(sk, err);
smp_wmb();
tcp_write_queue_purge(sk);
tcp_done(sk);
if (!sock_flag(sk, SOCK_DEAD))
sk_error_report(sk);
} }
/* /*
......
...@@ -611,15 +611,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) ...@@ -611,15 +611,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
if (!sock_owned_by_user(sk)) { if (!sock_owned_by_user(sk))
WRITE_ONCE(sk->sk_err, err); tcp_done_with_error(sk, err);
else
sk_error_report(sk);
tcp_done(sk);
} else {
WRITE_ONCE(sk->sk_err_soft, err); WRITE_ONCE(sk->sk_err_soft, err);
}
goto out; goto out;
} }
......
...@@ -74,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) ...@@ -74,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
static void tcp_write_err(struct sock *sk) static void tcp_write_err(struct sock *sk)
{ {
WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
sk_error_report(sk);
tcp_write_queue_purge(sk);
tcp_done(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
} }
......
...@@ -490,14 +490,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, ...@@ -490,14 +490,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th);
if (!sock_owned_by_user(sk)) { if (!sock_owned_by_user(sk))
WRITE_ONCE(sk->sk_err, err); tcp_done_with_error(sk, err);
sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ else
tcp_done(sk);
} else {
WRITE_ONCE(sk->sk_err_soft, err); WRITE_ONCE(sk->sk_err_soft, err);
}
goto out; goto out;
case TCP_LISTEN: case TCP_LISTEN:
break; break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment