Commit 7f0c940b authored by Jakub Kicinski

Merge branch 'mptcp-msg_fastopen-and-tfo-listener-side-support'

Matthieu Baerts says:

====================
mptcp: MSG_FASTOPEN and TFO listener side support

Before this series, only the initiator of a connection was able to combine
both TCP FastOpen and MPTCP when using TCP_FASTOPEN_CONNECT socket option.

These new patches here add (in theory) the full support of TFO with MPTCP,
which means:

 - MSG_FASTOPEN sendmsg flag support (patch 1/8)
 - TFO support for the listener side (patches 2-5/8)
 - TCP_FASTOPEN socket option (patch 6/8)
 - TCP_FASTOPEN_KEY socket option (patch 7/8)

To support TFO for the server side, a few preparation patches are needed
(patches 2 to 5/8). Some of them were inspired by a previous work from
Benjamin Hesmans.

Note that TFO support with MPTCP has been validated with selftests
(patch 8/8) but also with Packetdrill tests running with a modified
but still very WIP version supporting MPTCP. Both the modified tool
and the tests are available online:

  https://github.com/multipath-tcp/packetdrill/
====================

Link: https://lore.kernel.org/r/20221125222958.958636-1-matthieu.baerts@tessares.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents f2bb566f ca7ae891
......@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
mib.o pm_netlink.o sockopt.o pm_userspace.o
mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
......
// SPDX-License-Identifier: GPL-2.0
/* MPTCP Fast Open Mechanism
*
* Copyright (c) 2021-2022, Dmytro SHYTYI
*/
#include "protocol.h"
/* Flag the subflow as doing MPTFO and move the first skb of the subflow
 * receive queue (the fastopen data) to the receive queue of the owning
 * MPTCP socket.
 *
 * @subflow: subflow context; subflow->tcp_sock is the socket the skb is
 *           taken from, subflow->conn the socket it is queued on.
 * @req: request socket; not referenced in the body of this function.
 *
 * If the subflow receive queue is empty a one-time warning is emitted and
 * the function returns; is_mptfo has already been set at that point.
 */
void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
struct request_sock *req)
{
struct sock *ssk = subflow->tcp_sock;
struct sock *sk = subflow->conn;
struct sk_buff *skb;
struct tcp_sock *tp;
tp = tcp_sk(ssk);
subflow->is_mptfo = 1;
skb = skb_peek(&ssk->sk_receive_queue);
if (WARN_ON_ONCE(!skb))
return;
/* dequeue the skb from the subflow receive queue and drop its
 * extensions and its current owner before handing it to the msk
 */
__skb_unlink(skb, &ssk->sk_receive_queue);
skb_ext_reset(skb);
skb_orphan(skb);
/* The fastopen data is copied, but it does not belong to the MPTCP
 * sequence space: account for it in the subflow sequence instead, see
 * mptcp_subflow_get_map_offset()
 */
tp->copied_seq += skb->len;
subflow->ssn_offset += skb->len;
/* initialize a dummy sequence number, we will update it at MPC
 * completion, if needed: [-len, 0) becomes [ack_seq - len, ack_seq) once
 * mptcp_fastopen_gen_msk_ackseq() adds msk->ack_seq to both fields
 */
MPTCP_SKB_CB(skb)->map_seq = -skb->len;
MPTCP_SKB_CB(skb)->end_seq = 0;
MPTCP_SKB_CB(skb)->offset = 0;
MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
/* charge the skb to the msk, queue it and signal the new data, all
 * under the msk data lock
 */
mptcp_data_lock(sk);
mptcp_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk);
mptcp_data_unlock(sk);
}
/* Rebase the placeholder sequence numbers of the queued fastopen skb on the
 * msk-level ack sequence.
 *
 * mptcp_fastopen_subflow_synack_set_params() queues the fastopen skb with
 * map_seq = -len and end_seq = 0; adding msk->ack_seq to both turns them
 * into [ack_seq - len, ack_seq).
 *
 * @msk: MPTCP socket whose receive queue tail is inspected and whose ack_seq
 *       is used as the base.
 * @subflow: not referenced in the body of this function.
 * @mp_opt: not referenced in the body of this function.
 *
 * Runs under the msk data lock. Nothing is modified when the receive queue
 * is empty.
 */
void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
				   const struct mptcp_options_received *mp_opt)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;

	mptcp_data_lock(sk);
	skb = skb_peek_tail(&sk->sk_receive_queue);
	if (skb) {
		/* the placeholder end_seq is expected to still be 0 here */
		WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);
		/* terminate debug messages with '\n': no pr_cont() follows,
		 * and an unterminated line is only emitted once something
		 * else gets logged
		 */
		pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk,
			 MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq,
			 MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq);
		MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq;
		MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq;
	}

	pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq);
	mptcp_data_unlock(sk);
}
......@@ -26,6 +26,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
{
u8 subtype = *ptr >> 4;
int expected_opsize;
u16 subopt;
u8 version;
u8 flags;
u8 i;
......@@ -38,11 +39,15 @@ static void mptcp_parse_option(const struct sk_buff *skb,
expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
else
expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
subopt = OPTION_MPTCP_MPC_ACK;
} else {
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) {
expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
else
subopt = OPTION_MPTCP_MPC_SYNACK;
} else {
expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
subopt = OPTION_MPTCP_MPC_SYN;
}
}
/* Cfr RFC 8684 Section 3.3.0:
......@@ -85,7 +90,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0);
mp_opt->suboptions |= OPTIONS_MPTCP_MPC;
mp_opt->suboptions |= subopt;
if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
mp_opt->sndr_key = get_unaligned_be64(ptr);
ptr += 8;
......@@ -934,7 +939,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) &&
!subflow->request_join)
tcp_send_ack(ssk);
goto fully_established;
goto check_notify;
}
/* we must process OoO packets before the first subflow is fully
......@@ -945,17 +950,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
if (subflow->mp_join)
goto reset;
if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
goto set_fully_established;
return subflow->mp_capable;
}
if (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo)) {
if (subflow->remote_key_valid &&
(((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) {
/* subflows are fully established as soon as we get any
* additional ack, including ADD_ADDR.
*/
subflow->fully_established = 1;
WRITE_ONCE(msk->fully_established, true);
goto fully_established;
goto check_notify;
}
/* If the first established packet does not contain MP_CAPABLE + data
......@@ -974,11 +982,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (mp_opt->deny_join_id0)
WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
set_fully_established:
if (unlikely(!READ_ONCE(msk->pm.server_side)))
pr_warn_once("bogus mpc option on established client sk");
mptcp_subflow_fully_established(subflow, mp_opt);
fully_established:
check_notify:
/* if the subflow is not already linked into the conn_list, we can't
* notify the PM: this subflow is still on the listener queue
* and the PM possibly acquiring the subflow lock could race with
......
......@@ -36,15 +36,6 @@ struct mptcp6_sock {
};
#endif
struct mptcp_skb_cb {
u64 map_seq;
u64 end_seq;
u32 offset;
u8 has_rxtstamp:1;
};
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
enum {
MPTCP_CMSG_TS = BIT(0),
MPTCP_CMSG_INQ = BIT(1),
......@@ -200,7 +191,7 @@ static void mptcp_rfree(struct sk_buff *skb)
mptcp_rmem_uncharge(sk, len);
}
static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
skb->sk = sk;
......@@ -1711,17 +1702,14 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int ret = 0;
long timeo;
/* we don't support FASTOPEN yet */
if (msg->msg_flags & MSG_FASTOPEN)
return -EOPNOTSUPP;
/* silently ignore everything else */
msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN;
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) {
if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect ||
msg->msg_flags & MSG_FASTOPEN))) {
int copied_syn = 0;
ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn);
......@@ -3048,7 +3036,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
struct mptcp_sock *msk;
u64 ack_seq;
if (!nsk)
return NULL;
......@@ -3074,15 +3061,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) {
msk->can_ack = true;
msk->remote_key = mp_opt->sndr_key;
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
ack_seq++;
WRITE_ONCE(msk->ack_seq, ack_seq);
atomic64_set(&msk->rcv_wnd_sent, ack_seq);
}
sock_reset_flag(nsk, SOCK_RCU_FREE);
/* will be fully established after successful MPC subflow creation */
inet_sk_state_store(nsk, TCP_SYN_RECV);
......@@ -3355,7 +3333,6 @@ void mptcp_finish_connect(struct sock *ssk)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk;
struct sock *sk;
u64 ack_seq;
subflow = mptcp_subflow_ctx(ssk);
sk = subflow->conn;
......@@ -3363,22 +3340,16 @@ void mptcp_finish_connect(struct sock *ssk)
pr_debug("msk=%p, token=%u", sk, subflow->token);
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
ack_seq++;
subflow->map_seq = ack_seq;
subflow->map_seq = subflow->iasn;
subflow->map_subflow_seq = 1;
/* the socket is not connected yet, no msk/subflow ops can access/race
* accessing the field below
*/
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->local_key, subflow->local_key);
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
WRITE_ONCE(msk->snd_una, msk->write_seq);
atomic64_set(&msk->rcv_wnd_sent, ack_seq);
mptcp_pm_new_connection(msk, ssk, 0);
......
......@@ -126,6 +126,15 @@
#define MPTCP_CONNECTED 6
#define MPTCP_RESET_SCHEDULER 7
/* MPTCP private per-skb state; accessed through MPTCP_SKB_CB(), which
 * overlays this struct on the start of skb->cb[].
 */
struct mptcp_skb_cb {
/* first sequence number covered by the skb; for fastopen data this is
 * initialised relative to 0 and later rebased on msk->ack_seq
 */
u64 map_seq;
/* sequence number following the skb data (map_seq + len for fastopen data) */
u64 end_seq;
/* NOTE(review): presumably the amount of skb data already consumed;
 * it is only ever set to 0 in the code visible here - confirm
 */
u32 offset;
/* copied from TCP_SKB_CB(skb)->has_rxtstamp when the skb is moved to the msk */
u8 has_rxtstamp:1;
};
/* view the control buffer of __skb as a struct mptcp_skb_cb */
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
static inline bool before64(__u64 seq1, __u64 seq2)
{
return (__s64)(seq1 - seq2) < 0;
......@@ -467,17 +476,22 @@ struct mptcp_subflow_context {
send_fastclose : 1,
send_infinite_map : 1,
rx_eof : 1,
can_ack : 1, /* only after processing the remote a key */
remote_key_valid : 1, /* received the peer key from */
disposable : 1, /* ctx can be free at ulp release time */
stale : 1, /* unable to snd/rcv data, do not use for xmit */
local_id_valid : 1, /* local_id is correctly initialized */
valid_csum_seen : 1; /* at least one csum validated */
valid_csum_seen : 1, /* at least one csum validated */
is_mptfo : 1, /* subflow is doing TFO */
__unused : 8;
enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
u32 remote_token;
u8 hmac[MPTCPOPT_HMAC_LEN];
union {
u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */
u64 iasn; /* initial ack sequence number, MPC subflows only */
};
u8 local_id;
u8 remote_id;
u8 reset_seen:1;
......@@ -603,7 +617,7 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
void mptcp_check_and_set_pending(struct sock *sk);
void __mptcp_push_pending(struct sock *sk, unsigned int flags);
......@@ -619,6 +633,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk);
void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk);
bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
const struct mptcp_addr_info *b, bool use_port);
......@@ -826,6 +841,11 @@ void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_
void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id);
bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);
void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
const struct mptcp_options_received *mp_opt);
void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
struct request_sock *req);
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
return READ_ONCE(msk->pm.addr_signal) &
......
......@@ -559,7 +559,9 @@ static bool mptcp_supported_sockopt(int level, int optname)
case TCP_NOTSENT_LOWAT:
case TCP_TX_DELAY:
case TCP_INQ:
case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
return true;
}
......@@ -569,9 +571,6 @@ static bool mptcp_supported_sockopt(int level, int optname)
/* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
* TCP_REPAIR_WINDOW are not supported, better avoid this mess
*/
/* TCP_FASTOPEN_KEY, TCP_FASTOPEN are not supported because
* fastopen for the listener side is currently unsupported
*/
}
return false;
}
......@@ -801,7 +800,9 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
return 0;
case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname,
optval, optlen);
......@@ -1166,7 +1167,9 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
case TCP_INFO:
case TCP_CC_INFO:
case TCP_DEFER_ACCEPT:
case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
optval, optlen);
......
......@@ -307,7 +307,48 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
return NULL;
}
static void subflow_prep_synack(const struct sock *sk, struct request_sock *req,
struct tcp_fastopen_cookie *foc,
enum tcp_synack_type synack_type)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct inet_request_sock *ireq = inet_rsk(req);
/* clear tstamp_ok, as needed depending on cookie */
if (foc && foc->len > -1)
ireq->tstamp_ok = 0;
if (synack_type == TCP_SYNACK_FASTOPEN)
mptcp_fastopen_subflow_synack_set_params(subflow, req);
}
/* IPv4 send_synack hook for MPTCP subflows: run the MPTCP SYN-ACK
 * preparation, then delegate to the plain TCP IPv4 implementation and
 * return its result.
 */
static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
				  struct flowi *fl,
				  struct request_sock *req,
				  struct tcp_fastopen_cookie *foc,
				  enum tcp_synack_type synack_type,
				  struct sk_buff *syn_skb)
{
	int err;

	subflow_prep_synack(sk, req, foc, synack_type);

	err = tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc,
						    synack_type, syn_skb);
	return err;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 send_synack hook for MPTCP subflows: run the MPTCP SYN-ACK
 * preparation, then delegate to the plain TCP IPv6 implementation and
 * return its result.
 */
static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
				  struct flowi *fl,
				  struct request_sock *req,
				  struct tcp_fastopen_cookie *foc,
				  enum tcp_synack_type synack_type,
				  struct sk_buff *syn_skb)
{
	int err;

	subflow_prep_synack(sk, req, foc, synack_type);

	err = tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc,
						    synack_type, syn_skb);
	return err;
}
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
struct sk_buff *skb,
struct flowi *fl,
......@@ -392,11 +433,33 @@ static void mptcp_set_connected(struct sock *sk)
mptcp_data_unlock(sk);
}
/* Record the peer key carried by @mp_opt on @subflow and publish the values
 * derived from it on @msk. Only the first call for a given subflow has any
 * effect.
 *
 * @msk: MPTCP socket receiving remote_key, ack_seq, can_ack and rcv_wnd_sent.
 * @subflow: subflow context receiving remote_key and iasn.
 * @mp_opt: received options; only sndr_key is read.
 */
static void subflow_set_remote_key(struct mptcp_sock *msk,
struct mptcp_subflow_context *subflow,
const struct mptcp_options_received *mp_opt)
{
/* active MPC subflow will reach here multiple times:
* at subflow_finish_connect() time and at 4th ack time
*/
if (subflow->remote_key_valid)
return;
subflow->remote_key_valid = 1;
subflow->remote_key = mp_opt->sndr_key;
/* iasn (initial ack sequence number) is the value produced by
 * mptcp_crypto_key_sha() for the remote key, plus one
 */
mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
subflow->iasn++;
/* propagate the key and the initial ack sequence to the msk */
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->ack_seq, subflow->iasn);
WRITE_ONCE(msk->can_ack, true);
atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
}
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_options_received mp_opt;
struct sock *parent = subflow->conn;
struct mptcp_sock *msk;
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
......@@ -404,6 +467,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (subflow->conn_finished)
return;
msk = mptcp_sk(parent);
mptcp_propagate_sndbuf(parent, sk);
subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
......@@ -416,19 +480,16 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
mptcp_do_fallback(sk);
pr_fallback(mptcp_sk(subflow->conn));
pr_fallback(msk);
goto fallback;
}
if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true);
WRITE_ONCE(msk->csum_enabled, true);
if (mp_opt.deny_join_id0)
WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true);
WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
subflow->mp_capable = 1;
subflow->can_ack = 1;
subflow->remote_key = mp_opt.sndr_key;
pr_debug("subflow=%p, remote_key=%llu", subflow,
subflow->remote_key);
subflow_set_remote_key(msk, subflow, &mp_opt);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
mptcp_finish_connect(sk);
mptcp_set_connected(parent);
......@@ -466,7 +527,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
if (subflow_use_different_dport(mptcp_sk(parent), sk)) {
if (subflow_use_different_dport(msk, sk)) {
pr_debug("synack inet_dport=%d %d",
ntohs(inet_sk(sk)->inet_dport),
ntohs(inet_sk(parent)->inet_dport));
......@@ -474,7 +535,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
}
} else if (mptcp_check_fallback(sk)) {
fallback:
mptcp_rcv_space_init(mptcp_sk(parent), sk);
mptcp_rcv_space_init(msk, sk);
mptcp_set_connected(parent);
}
return;
......@@ -637,14 +698,16 @@ static void subflow_drop_ctx(struct sock *ssk)
}
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt)
const struct mptcp_options_received *mp_opt)
{
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
subflow->remote_key = mp_opt->sndr_key;
subflow_set_remote_key(msk, subflow, mp_opt);
subflow->fully_established = 1;
subflow->can_ack = 1;
WRITE_ONCE(msk->fully_established, true);
if (subflow->is_mptfo)
mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
}
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
......@@ -760,7 +823,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
/* with OoO packets we can reach here without ingress
* mpc option
*/
if (mp_opt.suboptions & OPTIONS_MPTCP_MPC)
if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)
mptcp_subflow_fully_established(ctx, &mp_opt);
} else if (ctx->mp_join) {
struct mptcp_sock *owner;
......@@ -1198,16 +1261,8 @@ static bool subflow_check_data_avail(struct sock *ssk)
if (WARN_ON_ONCE(!skb))
goto no_data;
/* if msk lacks the remote key, this subflow must provide an
* MP_CAPABLE-based mapping
*/
if (unlikely(!READ_ONCE(msk->can_ack))) {
if (!subflow->mpc_map)
if (unlikely(!READ_ONCE(msk->can_ack)))
goto fallback;
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->ack_seq, subflow->map_seq);
WRITE_ONCE(msk->can_ack, true);
}
old_ack = READ_ONCE(msk->ack_seq);
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
......@@ -1480,6 +1535,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
&flags, &ifindex);
subflow->remote_key_valid = 1;
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
subflow->token = msk->token;
......@@ -1873,6 +1929,7 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->mp_join = 1;
new_ctx->fully_established = 1;
new_ctx->remote_key_valid = 1;
new_ctx->backup = subflow_req->backup;
new_ctx->remote_id = subflow_req->remote_id;
new_ctx->token = subflow_req->token;
......@@ -1929,6 +1986,7 @@ void __init mptcp_subflow_init(void)
subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack;
subflow_specific = ipv4_specific;
subflow_specific.conn_request = subflow_v4_conn_request;
......@@ -1942,6 +2000,7 @@ void __init mptcp_subflow_init(void)
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack;
subflow_v6_specific = ipv6_specific;
subflow_v6_specific.conn_request = subflow_v6_conn_request;
......
......@@ -83,6 +83,7 @@ struct cfg_cmsg_types {
/* Socket options requested on the command line */
struct cfg_sockopt_types {
/* when set, set_transparent() is applied to the listening socket */
unsigned int transparent:1;
/* set by the "MPTFO" option name: the listener calls set_mptfo() and the
 * connecting side uses sendto() with MSG_FASTOPEN instead of connect()
 */
unsigned int mptfo:1;
};
struct tcp_inq_state {
......@@ -90,6 +91,13 @@ struct tcp_inq_state {
bool expect_eof;
};
/* Write-side state shared between the connect step and the copy loops, so
 * that data read for the fastopen sendto() is neither lost nor sent twice.
 */
struct wstate {
/* data read from the input fd and pending transmission */
char buf[8192];
/* number of bytes in buf still to be written */
unsigned int len;
/* offset in buf of the next byte to write */
unsigned int off;
/* number of bytes read from the input fd for the fastopen attempt;
 * do_mmap() and do_sendfile() skip this many input bytes
 */
unsigned int total_len;
};
static struct tcp_inq_state tcp_inq;
static struct cfg_cmsg_types cfg_cmsg_types;
......@@ -232,6 +240,14 @@ static void set_transparent(int fd, int pf)
}
}
/* Set the TCP_FASTOPEN socket option on @fd with a value of 25.
 * A failure is reported with perror() and otherwise ignored.
 * @pf is not used.
 */
static void set_mptfo(int fd, int pf)
{
	const int fastopen_qlen = 25;
	int rc;

	rc = setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &fastopen_qlen,
			sizeof(fastopen_qlen));
	if (rc == -1)
		perror("TCP_FASTOPEN");
}
static int do_ulp_so(int sock, const char *name)
{
return setsockopt(sock, IPPROTO_TCP, TCP_ULP, name, strlen(name));
......@@ -300,6 +316,9 @@ static int sock_listen_mptcp(const char * const listenaddr,
if (cfg_sockopt_types.transparent)
set_transparent(sock, pf);
if (cfg_sockopt_types.mptfo)
set_mptfo(sock, pf);
if (bind(sock, a->ai_addr, a->ai_addrlen) == 0)
break; /* success */
......@@ -330,13 +349,15 @@ static int sock_listen_mptcp(const char * const listenaddr,
static int sock_connect_mptcp(const char * const remoteaddr,
const char * const port, int proto,
struct addrinfo **peer)
struct addrinfo **peer,
int infd, struct wstate *winfo)
{
struct addrinfo hints = {
.ai_protocol = IPPROTO_TCP,
.ai_socktype = SOCK_STREAM,
};
struct addrinfo *a, *addr;
int syn_copied = 0;
int sock = -1;
hints.ai_family = pf;
......@@ -354,15 +375,35 @@ static int sock_connect_mptcp(const char * const remoteaddr,
if (cfg_mark)
set_mark(sock, cfg_mark);
if (cfg_sockopt_types.mptfo) {
if (!winfo->total_len)
winfo->total_len = winfo->len = read(infd, winfo->buf,
sizeof(winfo->buf));
syn_copied = sendto(sock, winfo->buf, winfo->len, MSG_FASTOPEN,
a->ai_addr, a->ai_addrlen);
if (syn_copied >= 0) {
winfo->off = syn_copied;
winfo->len -= syn_copied;
*peer = a;
break; /* success */
}
} else {
if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) {
*peer = a;
break; /* success */
}
}
if (cfg_sockopt_types.mptfo) {
perror("sendto()");
close(sock);
sock = -1;
} else {
perror("connect()");
close(sock);
sock = -1;
}
}
freeaddrinfo(addr);
if (sock != -1)
......@@ -571,14 +612,14 @@ static void shut_wr(int fd)
shutdown(fd, SHUT_WR);
}
static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after_out)
static int copyfd_io_poll(int infd, int peerfd, int outfd,
bool *in_closed_after_out, struct wstate *winfo)
{
struct pollfd fds = {
.fd = peerfd,
.events = POLLIN | POLLOUT,
};
unsigned int woff = 0, wlen = 0, total_wlen = 0, total_rlen = 0;
char wbuf[8192];
unsigned int total_wlen = 0, total_rlen = 0;
set_nonblock(peerfd, true);
......@@ -638,19 +679,19 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
}
if (fds.revents & POLLOUT) {
if (wlen == 0) {
woff = 0;
wlen = read(infd, wbuf, sizeof(wbuf));
if (winfo->len == 0) {
winfo->off = 0;
winfo->len = read(infd, winfo->buf, sizeof(winfo->buf));
}
if (wlen > 0) {
if (winfo->len > 0) {
ssize_t bw;
/* limit the total amount of written data to the trunc value */
if (cfg_truncate > 0 && wlen + total_wlen > cfg_truncate)
wlen = cfg_truncate - total_wlen;
if (cfg_truncate > 0 && winfo->len + total_wlen > cfg_truncate)
winfo->len = cfg_truncate - total_wlen;
bw = do_rnd_write(peerfd, wbuf + woff, wlen);
bw = do_rnd_write(peerfd, winfo->buf + winfo->off, winfo->len);
if (bw < 0) {
if (cfg_rcv_trunc)
return 0;
......@@ -658,10 +699,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
return 111;
}
woff += bw;
wlen -= bw;
winfo->off += bw;
winfo->len -= bw;
total_wlen += bw;
} else if (wlen == 0) {
} else if (winfo->len == 0) {
/* We have no more data to send. */
fds.events &= ~POLLOUT;
......@@ -717,10 +758,26 @@ static int do_recvfile(int infd, int outfd)
return (int)r;
}
static int do_mmap(int infd, int outfd, unsigned int size)
/* Write out whatever is still pending in @winfo->buf to @fd, advancing
 * @winfo->off and shrinking @winfo->len after every write().
 *
 * Returns 0 once nothing is left to write, or 4 (after perror()) when
 * write() fails.
 *
 * NOTE(review): the callers visible in this file test the result with
 * "ret < 0", which the positive error code returned here never satisfies.
 */
static int spool_buf(int fd, struct wstate *winfo)
{
	for (;;) {
		int written;

		if (!winfo->len)
			return 0;

		written = write(fd, winfo->buf + winfo->off, winfo->len);
		if (written < 0) {
			perror("write");
			return 4;
		}

		winfo->off += written;
		winfo->len -= written;
	}
}
static int do_mmap(int infd, int outfd, unsigned int size,
struct wstate *winfo)
{
char *inbuf = mmap(NULL, size, PROT_READ, MAP_SHARED, infd, 0);
ssize_t ret = 0, off = 0;
ssize_t ret = 0, off = winfo->total_len;
size_t rem;
if (inbuf == MAP_FAILED) {
......@@ -728,7 +785,11 @@ static int do_mmap(int infd, int outfd, unsigned int size)
return 1;
}
rem = size;
ret = spool_buf(outfd, winfo);
if (ret < 0)
return ret;
rem = size - winfo->total_len;
while (rem > 0) {
ret = write(outfd, inbuf + off, rem);
......@@ -772,8 +833,16 @@ static int get_infd_size(int fd)
return (int)count;
}
static int do_sendfile(int infd, int outfd, unsigned int count)
static int do_sendfile(int infd, int outfd, unsigned int count,
struct wstate *winfo)
{
int ret = spool_buf(outfd, winfo);
if (ret < 0)
return ret;
count -= winfo->total_len;
while (count > 0) {
ssize_t r;
......@@ -790,7 +859,8 @@ static int do_sendfile(int infd, int outfd, unsigned int count)
}
static int copyfd_io_mmap(int infd, int peerfd, int outfd,
unsigned int size, bool *in_closed_after_out)
unsigned int size, bool *in_closed_after_out,
struct wstate *winfo)
{
int err;
......@@ -799,9 +869,9 @@ static int copyfd_io_mmap(int infd, int peerfd, int outfd,
if (err)
return err;
err = do_mmap(infd, peerfd, size);
err = do_mmap(infd, peerfd, size, winfo);
} else {
err = do_mmap(infd, peerfd, size);
err = do_mmap(infd, peerfd, size, winfo);
if (err)
return err;
......@@ -815,7 +885,7 @@ static int copyfd_io_mmap(int infd, int peerfd, int outfd,
}
static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
unsigned int size, bool *in_closed_after_out)
unsigned int size, bool *in_closed_after_out, struct wstate *winfo)
{
int err;
......@@ -824,9 +894,9 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
if (err)
return err;
err = do_sendfile(infd, peerfd, size);
err = do_sendfile(infd, peerfd, size, winfo);
} else {
err = do_sendfile(infd, peerfd, size);
err = do_sendfile(infd, peerfd, size, winfo);
if (err)
return err;
......@@ -839,7 +909,7 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
return err;
}
static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd)
static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd, struct wstate *winfo)
{
bool in_closed_after_out = false;
struct timespec start, end;
......@@ -851,21 +921,24 @@ static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd)
switch (cfg_mode) {
case CFG_MODE_POLL:
ret = copyfd_io_poll(infd, peerfd, outfd, &in_closed_after_out);
ret = copyfd_io_poll(infd, peerfd, outfd, &in_closed_after_out,
winfo);
break;
case CFG_MODE_MMAP:
file_size = get_infd_size(infd);
if (file_size < 0)
return file_size;
ret = copyfd_io_mmap(infd, peerfd, outfd, file_size, &in_closed_after_out);
ret = copyfd_io_mmap(infd, peerfd, outfd, file_size,
&in_closed_after_out, winfo);
break;
case CFG_MODE_SENDFILE:
file_size = get_infd_size(infd);
if (file_size < 0)
return file_size;
ret = copyfd_io_sendfile(infd, peerfd, outfd, file_size, &in_closed_after_out);
ret = copyfd_io_sendfile(infd, peerfd, outfd, file_size,
&in_closed_after_out, winfo);
break;
default:
......@@ -999,6 +1072,7 @@ static void maybe_close(int fd)
int main_loop_s(int listensock)
{
struct sockaddr_storage ss;
struct wstate winfo;
struct pollfd polls;
socklen_t salen;
int remotesock;
......@@ -1033,7 +1107,8 @@ int main_loop_s(int listensock)
SOCK_TEST_TCPULP(remotesock, 0);
copyfd_io(fd, remotesock, 1, true);
memset(&winfo, 0, sizeof(winfo));
copyfd_io(fd, remotesock, 1, true, &winfo);
} else {
perror("accept");
return 1;
......@@ -1130,6 +1205,11 @@ static void parse_setsock_options(const char *name)
return;
}
if (strncmp(name, "MPTFO", len) == 0) {
cfg_sockopt_types.mptfo = 1;
return;
}
fprintf(stderr, "Unrecognized setsockopt option %s\n", name);
exit(1);
}
......@@ -1166,11 +1246,18 @@ void xdisconnect(int fd, int addrlen)
int main_loop(void)
{
int fd, ret, fd_in = 0;
int fd = 0, ret, fd_in = 0;
struct addrinfo *peer;
struct wstate winfo;
/* listener is ready. */
fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto, &peer);
if (cfg_input && cfg_sockopt_types.mptfo) {
fd_in = open(cfg_input, O_RDONLY);
if (fd < 0)
xerror("can't open %s:%d", cfg_input, errno);
}
memset(&winfo, 0, sizeof(winfo));
fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto, &peer, fd_in, &winfo);
if (fd < 0)
return 2;
......@@ -1186,14 +1273,13 @@ int main_loop(void)
if (cfg_cmsg_types.cmsg_enabled)
apply_cmsg_types(fd, &cfg_cmsg_types);
if (cfg_input) {
if (cfg_input && !cfg_sockopt_types.mptfo) {
fd_in = open(cfg_input, O_RDONLY);
if (fd < 0)
xerror("can't open %s:%d", cfg_input, errno);
}
/* close the client socket open only if we are not going to reconnect */
ret = copyfd_io(fd_in, fd, 1, 0);
ret = copyfd_io(fd_in, fd, 1, 0, &winfo);
if (ret)
return ret;
......@@ -1210,6 +1296,7 @@ int main_loop(void)
xerror("can't reconnect: %d", errno);
if (cfg_input)
close(fd_in);
memset(&winfo, 0, sizeof(winfo));
goto again;
} else {
close(fd);
......
......@@ -762,6 +762,23 @@ run_tests_peekmode()
run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}"
}
# Run the loopback tests with "-o MPTFO" between ns1 and ns2, twice for the
# IPv4 address and twice for the IPv6 address, with net.ipv4.tcp_fastopen set
# to 2 in ns1 and 1 in ns2 for the duration of the tests and reset to 0 after.
# NOTE(review): presumably each address is tested twice so that the second
# connection can reuse the cookie obtained by the first - confirm.
run_tests_mptfo()
{
	local mptfo_addr

	echo "INFO: with MPTFO start"
	ip netns exec "$ns1" sysctl -q net.ipv4.tcp_fastopen=2
	ip netns exec "$ns2" sysctl -q net.ipv4.tcp_fastopen=1

	for mptfo_addr in 10.0.1.1 10.0.1.1 dead:beef:1::1 dead:beef:1::1; do
		run_tests_lo "$ns1" "$ns2" "$mptfo_addr" 0 "-o MPTFO"
	done

	ip netns exec "$ns1" sysctl -q net.ipv4.tcp_fastopen=0
	ip netns exec "$ns2" sysctl -q net.ipv4.tcp_fastopen=0
	echo "INFO: with MPTFO end"
}
run_tests_disconnect()
{
local peekmode="$1"
......@@ -901,6 +918,10 @@ run_tests_peekmode "saveWithPeek"
run_tests_peekmode "saveAfterPeek"
stop_if_error "Tests with peek mode have failed"
# MPTFO (MultiPath TCP Fastopen) tests
run_tests_mptfo
stop_if_error "Tests with MPTFO have failed"
# connect to ns4 ip address, ns2 should intercept/proxy
run_test_transparent 10.0.3.1 "tproxy ipv4"
run_test_transparent dead:beef:3::1 "tproxy ipv6"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment