Commit 3f8e0aae authored by Paolo Abeni, committed by David S. Miller

mptcp: rework mptcp_sendmsg_frag to accept optional dfrag

This will simplify the mptcp-level retransmission implementation
in the next patch. If a dfrag is provided by the caller, skip the
kernel-space memory allocation and use the data and metadata
carried by the dfrag itself.
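
In code terms (see the hunk below), the only per-call difference is
which page and which sequence counter mptcp_sendmsg_frag() operates on:

	bool retransmission = !!dfrag;

	if (!retransmission) {
		/* first transmission: copy from userspace into the page
		 * frag cache and advance msk->write_seq
		 */
		write_seq = &msk->write_seq;
		page = pfrag->page;
	} else {
		/* retransmission: reuse the data and sequence number the
		 * dfrag already records, no new allocation
		 */
		write_seq = &dfrag->data_seq;
		page = dfrag->page;
	}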

Because a peer could ack data at TCP level but refrain from
sending mptcp-level ACKs, we could grow the mptcp socket
backlog indefinitely.

We should thus block mptcp_sendmsg until the peer has acked some of the
sent data.

In order to be able to do so, increment the mptcp socket wmem_queued
counter on memory allocation and decrement it when releasing the memory
on mptcp-level ack reception.
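
This is what makes the blocking work: the stream send path waits while
sk_stream_memory_free() sees sk_wmem_queued at or above sk_sndbuf. A rough
sketch of the two halves of the accounting; only the charge side is part of
this patch, the release side lands with the mptcp-level ack handling in a
later patch and is shown here purely for illustration:

	/* charge side (this patch, mptcp_sendmsg_frag): account the data
	 * kept on the mptcp rtx queue to the master socket
	 */
	sk_wmem_queued_add(sk, frag_truesize);

	/* release side (later patch, sketch only): undo the charge once
	 * the peer's mptcp-level ack covers the whole dfrag
	 */
	sk_wmem_queued_add(sk, -(dfrag->data_len + dfrag->overhead));
	sk_mem_uncharge(sk, dfrag->data_len + dfrag->overhead);
	put_page(dfrag->page);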

Because TCP performs sndbuf auto-tuning up to tcp_wmem[2], make
this the mptcp sk_sndbuf limit.
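
Concretely, mptcp_init_sock() now initializes the limit straight from the
ipv4 sysctl (see the last hunk below):

	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];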

In the future we could experiment with auto-tuning, as TCP does in
tcp_sndbuf_expand().

v2 -> v3:
 - remove 'inline' in foo.c files (David S. Miller)
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 7948f6cc
@@ -316,15 +316,15 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
 	return NULL;
 }
 
-static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
-					     const struct sk_buff *skb,
-					     const struct mptcp_ext *mpext)
+static bool mptcp_skb_can_collapse_to(u64 write_seq,
+				      const struct sk_buff *skb,
+				      const struct mptcp_ext *mpext)
 {
 	if (!tcp_skb_can_collapse_to(skb))
 		return false;
 
 	/* can collapse only if MPTCP level sequence is in order */
-	return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+	return mpext && mpext->data_seq + mpext->data_len == write_seq;
 }
 
 static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
@@ -417,23 +417,28 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
 }
 
 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
-			      struct msghdr *msg, long *timeo, int *pmss_now,
+			      struct msghdr *msg, struct mptcp_data_frag *dfrag,
+			      long *timeo, int *pmss_now,
 			      int *ps_goal)
 {
 	int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
 	bool dfrag_collapsed, can_collapse = false;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_ext *mpext = NULL;
-	struct mptcp_data_frag *dfrag;
+	bool retransmission = !!dfrag;
 	struct sk_buff *skb, *tail;
 	struct page_frag *pfrag;
+	struct page *page;
+	u64 *write_seq;
 	size_t psize;
 
 	/* use the mptcp page cache so that we can easily move the data
 	 * from one substream to another, but do per subflow memory accounting
+	 * Note: pfrag is used only !retransmission, but the compiler if
+	 * fooled into a warning if we don't init here
 	 */
 	pfrag = sk_page_frag(sk);
-	while (!mptcp_page_frag_refill(ssk, pfrag) ||
+	while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
 	       !mptcp_ext_cache_refill(msk)) {
 		ret = sk_stream_wait_memory(ssk, timeo);
 		if (ret)
@@ -447,6 +452,13 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		if (unlikely(__mptcp_needs_tcp_fallback(msk)))
 			return 0;
 	}
+	if (!retransmission) {
+		write_seq = &msk->write_seq;
+		page = pfrag->page;
+	} else {
+		write_seq = &dfrag->data_seq;
+		page = dfrag->page;
+	}
 
 	/* compute copy limit */
 	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
@@ -464,63 +476,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		 * SSN association set here
 		 */
 		can_collapse = (size_goal - skb->len > 0) &&
-			     mptcp_skb_can_collapse_to(msk, skb, mpext);
+			     mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
 		if (!can_collapse)
 			TCP_SKB_CB(skb)->eor = 1;
 		else
 			avail_size = size_goal - skb->len;
 	}
-	/* reuse tail pfrag, if possible, or carve a new one from the page
-	 * allocator
-	 */
-	dfrag = mptcp_rtx_tail(sk);
-	offset = pfrag->offset;
-	dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
-	if (!dfrag_collapsed) {
-		dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
-		offset = dfrag->offset;
-		frag_truesize = dfrag->overhead;
-	}
-	psize = min_t(size_t, pfrag->size - offset, avail_size);
-
-	/* Copy to page */
-	pr_debug("left=%zu", msg_data_left(msg));
-	psize = copy_page_from_iter(pfrag->page, offset,
-				    min_t(size_t, msg_data_left(msg), psize),
-				    &msg->msg_iter);
-	pr_debug("left=%zu", msg_data_left(msg));
-	if (!psize)
-		return -EINVAL;
-
-	if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
-		return -ENOMEM;
+	if (!retransmission) {
+		/* reuse tail pfrag, if possible, or carve a new one from the
+		 * page allocator
+		 */
+		dfrag = mptcp_rtx_tail(sk);
+		offset = pfrag->offset;
+		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+		if (!dfrag_collapsed) {
+			dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
+			offset = dfrag->offset;
+			frag_truesize = dfrag->overhead;
+		}
+		psize = min_t(size_t, pfrag->size - offset, avail_size);
+
+		/* Copy to page */
+		pr_debug("left=%zu", msg_data_left(msg));
+		psize = copy_page_from_iter(pfrag->page, offset,
+					    min_t(size_t, msg_data_left(msg),
+						  psize),
+					    &msg->msg_iter);
+		pr_debug("left=%zu", msg_data_left(msg));
+		if (!psize)
+			return -EINVAL;
+
+		if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
+			return -ENOMEM;
+	} else {
+		offset = dfrag->offset;
+		psize = min_t(size_t, dfrag->data_len, avail_size);
+	}
 
 	/* tell the TCP stack to delay the push so that we can safely
 	 * access the skb after the sendpages call
 	 */
-	ret = do_tcp_sendpages(ssk, pfrag->page, offset, psize,
+	ret = do_tcp_sendpages(ssk, page, offset, psize,
 			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
 	if (ret <= 0)
 		return ret;
 	frag_truesize += ret;
-	if (unlikely(ret < psize))
-		iov_iter_revert(&msg->msg_iter, psize - ret);
-
-	/* send successful, keep track of sent data for mptcp-level
-	 * retransmission
-	 */
-	dfrag->data_len += ret;
-	if (!dfrag_collapsed) {
-		get_page(dfrag->page);
-		list_add_tail(&dfrag->list, &msk->rtx_queue);
-	}
-
-	/* charge data on mptcp rtx queue to the master socket
-	 * Note: we charge such data both to sk and ssk
-	 */
-	sk->sk_forward_alloc -= frag_truesize;
+	if (!retransmission) {
+		if (unlikely(ret < psize))
+			iov_iter_revert(&msg->msg_iter, psize - ret);
+
+		/* send successful, keep track of sent data for mptcp-level
+		 * retransmission
+		 */
+		dfrag->data_len += ret;
+		if (!dfrag_collapsed) {
+			get_page(dfrag->page);
+			list_add_tail(&dfrag->list, &msk->rtx_queue);
+			sk_wmem_queued_add(sk, frag_truesize);
+		} else {
+			sk_wmem_queued_add(sk, ret);
+		}
+
+		/* charge data on mptcp rtx queue to the master socket
+		 * Note: we charge such data both to sk and ssk
+		 */
+		sk->sk_forward_alloc -= frag_truesize;
+	}
 
 	/* if the tail skb extension is still the cached one, collapsing
 	 * really happened. Note: we can't check for 'same skb' as the sk_buff
@@ -539,7 +562,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	msk->cached_ext = NULL;
 
 	memset(mpext, 0, sizeof(*mpext));
-	mpext->data_seq = msk->write_seq;
+	mpext->data_seq = *write_seq;
 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
 	mpext->data_len = ret;
 	mpext->use_map = 1;
@@ -550,8 +573,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		 mpext->dsn64);
 
 out:
-	pfrag->offset += frag_truesize;
-	msk->write_seq += ret;
+	if (!retransmission)
+		pfrag->offset += frag_truesize;
+	*write_seq += ret;
 	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
 
 	return ret;
@@ -663,7 +687,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	lock_sock(ssk);
 	while (msg_data_left(msg)) {
-		ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
 					 &size_goal);
 		if (ret < 0)
 			break;
@@ -974,6 +998,7 @@ static int mptcp_init_sock(struct sock *sk)
 		return ret;
 
 	sk_sockets_allocated_inc(sk);
+	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
 
 	if (!mptcp_is_enabled(sock_net(sk)))
 		return -ENOPROTOOPT;