Commit 99d1055c authored by Paolo Abeni, committed by David S. Miller

mptcp: wake-up readers only for in sequence data

Currently we rely on the subflow->data_avail field, which is subject to
races:

	ssk1
		skb len = 500 DSS(seq=1, len=1000, off=0)
		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL

	ssk2
		skb len = 500 DSS(seq = 501, len=1000)
		# data_avail == MPTCP_SUBFLOW_DATA_AVAIL

	ssk1
		skb len = 500 DSS(seq = 1, len=1000, off =500)
		# still data_avail == MPTCP_SUBFLOW_DATA_AVAIL,
		# as the skb is covered by a pre-existing map,
		# which was in-sequence at reception time.

Instead we can explicitly check if some data has been received in-sequence,
propagating the info from __mptcp_move_skbs_from_subflow().

Additionally, add READ_ONCE()/WRITE_ONCE() annotations to the 'data_avail'
memory accesses, as the msk will read it outside the subflow socket lock.

Fixes: 648ef4b8 ("mptcp: Implement MPTCP receive path")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 72f96132
...@@ -670,15 +670,13 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) ...@@ -670,15 +670,13 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
/* In most cases we will be able to lock the mptcp socket. If its already /* In most cases we will be able to lock the mptcp socket. If its already
* owned, we need to defer to the work queue to avoid ABBA deadlock. * owned, we need to defer to the work queue to avoid ABBA deadlock.
*/ */
static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{ {
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
unsigned int moved = 0; unsigned int moved = 0;
if (inet_sk_state_load(sk) == TCP_CLOSE) if (inet_sk_state_load(sk) == TCP_CLOSE)
return; return false;
mptcp_data_lock(sk);
__mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
__mptcp_ofo_queue(msk); __mptcp_ofo_queue(msk);
...@@ -690,7 +688,7 @@ static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) ...@@ -690,7 +688,7 @@ static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
*/ */
if (mptcp_pending_data_fin(sk, NULL)) if (mptcp_pending_data_fin(sk, NULL))
mptcp_schedule_work(sk); mptcp_schedule_work(sk);
mptcp_data_unlock(sk); return moved > 0;
} }
void mptcp_data_ready(struct sock *sk, struct sock *ssk) void mptcp_data_ready(struct sock *sk, struct sock *ssk)
...@@ -698,7 +696,6 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) ...@@ -698,7 +696,6 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
int sk_rbuf, ssk_rbuf; int sk_rbuf, ssk_rbuf;
bool wake;
/* The peer can send data while we are shutting down this /* The peer can send data while we are shutting down this
* subflow at msk destruction time, but we must avoid enqueuing * subflow at msk destruction time, but we must avoid enqueuing
...@@ -707,28 +704,22 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) ...@@ -707,28 +704,22 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
if (unlikely(subflow->disposable)) if (unlikely(subflow->disposable))
return; return;
/* move_skbs_to_msk below can legitly clear the data_avail flag,
* but we will need later to properly woke the reader, cache its
* value
*/
wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
if (wake)
set_bit(MPTCP_DATA_READY, &msk->flags);
ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
sk_rbuf = READ_ONCE(sk->sk_rcvbuf); sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
if (unlikely(ssk_rbuf > sk_rbuf)) if (unlikely(ssk_rbuf > sk_rbuf))
sk_rbuf = ssk_rbuf; sk_rbuf = ssk_rbuf;
/* over limit? can't append more skbs to msk */ /* over limit? can't append more skbs to msk, Also, no need to wake-up*/
if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
goto wake; return;
move_skbs_to_msk(msk, ssk);
wake: /* Wake-up the reader only for in-sequence data */
if (wake) mptcp_data_lock(sk);
if (move_skbs_to_msk(msk, ssk)) {
set_bit(MPTCP_DATA_READY, &msk->flags);
sk->sk_data_ready(sk); sk->sk_data_ready(sk);
}
mptcp_data_unlock(sk);
} }
static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
...@@ -860,7 +851,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) ...@@ -860,7 +851,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
sock_owned_by_me(sk); sock_owned_by_me(sk);
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
if (subflow->data_avail) if (READ_ONCE(subflow->data_avail))
return mptcp_subflow_tcp_sock(subflow); return mptcp_subflow_tcp_sock(subflow);
} }
......
...@@ -362,7 +362,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk) ...@@ -362,7 +362,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
enum mptcp_data_avail { enum mptcp_data_avail {
MPTCP_SUBFLOW_NODATA, MPTCP_SUBFLOW_NODATA,
MPTCP_SUBFLOW_DATA_AVAIL, MPTCP_SUBFLOW_DATA_AVAIL,
MPTCP_SUBFLOW_OOO_DATA
}; };
struct mptcp_delegated_action { struct mptcp_delegated_action {
......
...@@ -1000,7 +1000,7 @@ static bool subflow_check_data_avail(struct sock *ssk) ...@@ -1000,7 +1000,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
struct sk_buff *skb; struct sk_buff *skb;
if (!skb_peek(&ssk->sk_receive_queue)) if (!skb_peek(&ssk->sk_receive_queue))
subflow->data_avail = 0; WRITE_ONCE(subflow->data_avail, 0);
if (subflow->data_avail) if (subflow->data_avail)
return true; return true;
...@@ -1039,18 +1039,13 @@ static bool subflow_check_data_avail(struct sock *ssk) ...@@ -1039,18 +1039,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
ack_seq = mptcp_subflow_get_mapped_dsn(subflow); ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
ack_seq); ack_seq);
if (ack_seq == old_ack) { if (unlikely(before64(ack_seq, old_ack))) {
subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
break; continue;
} else if (after64(ack_seq, old_ack)) {
subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
break;
} }
/* only accept in-sequence mapping. Old values are spurious WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
* retransmission break;
*/
mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
} }
return true; return true;
...@@ -1070,7 +1065,7 @@ static bool subflow_check_data_avail(struct sock *ssk) ...@@ -1070,7 +1065,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
subflow->reset_transient = 0; subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMPTCP; subflow->reset_reason = MPTCP_RST_EMPTCP;
tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_send_active_reset(ssk, GFP_ATOMIC);
subflow->data_avail = 0; WRITE_ONCE(subflow->data_avail, 0);
return false; return false;
} }
...@@ -1080,7 +1075,7 @@ static bool subflow_check_data_avail(struct sock *ssk) ...@@ -1080,7 +1075,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_seq = READ_ONCE(msk->ack_seq);
subflow->map_data_len = skb->len; subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
return true; return true;
} }
...@@ -1092,7 +1087,7 @@ bool mptcp_subflow_data_available(struct sock *sk) ...@@ -1092,7 +1087,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
if (subflow->map_valid && if (subflow->map_valid &&
mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
subflow->map_valid = 0; subflow->map_valid = 0;
subflow->data_avail = 0; WRITE_ONCE(subflow->data_avail, 0);
pr_debug("Done with mapping: seq=%u data_len=%u", pr_debug("Done with mapping: seq=%u data_len=%u",
subflow->map_subflow_seq, subflow->map_subflow_seq,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment