Commit 988998ac authored by David S. Miller's avatar David S. Miller

Merge branch 'mptcp-tcp-fallback'

Mat Martineau says:

====================
mptcp: TCP fallback for established connections

RFC 8684 allows some MPTCP connections to fall back to regular TCP when
the MPTCP DSS checksum detects middlebox interference, there is only a
single subflow, and there is no unacknowledged out-of-sequence
data. When this condition is detected, the stack sends a MPTCP DSS
option with an "infinite mapping" to signal that a fallback is
happening, and the peers will stop sending MPTCP options in their TCP
headers. The Linux MPTCP stack has not yet supported this type of
fallback, instead closing the connection when the MPTCP checksum fails.

This series adds support for fallback to regular TCP in a more limited
scenario, for only MPTCP connections that have never connected
additional subflows or transmitted out-of-sequence data. The selftests
are also updated to check new MIBs that track infinite mappings.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 31693d02 8bd03be3
......@@ -35,7 +35,8 @@ struct mptcp_ext {
frozen:1,
reset_transient:1;
u8 reset_reason:4,
csum_reqd:1;
csum_reqd:1,
infinite_map:1;
};
#define MPTCP_RM_IDS_MAX 8
......
......@@ -84,6 +84,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext,
__field(u8, reset_transient)
__field(u8, reset_reason)
__field(u8, csum_reqd)
__field(u8, infinite_map)
),
TP_fast_assign(
......@@ -102,9 +103,10 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext,
__entry->reset_transient = mpext->reset_transient;
__entry->reset_reason = mpext->reset_reason;
__entry->csum_reqd = mpext->csum_reqd;
__entry->infinite_map = mpext->infinite_map;
),
TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u",
TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u infinite_map=%u",
__entry->data_ack, __entry->data_seq,
__entry->subflow_seq, __entry->data_len,
__entry->csum, __entry->use_map,
......@@ -112,7 +114,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext,
__entry->use_ack, __entry->ack64,
__entry->mpc_map, __entry->frozen,
__entry->reset_transient, __entry->reset_reason,
__entry->csum_reqd)
__entry->csum_reqd, __entry->infinite_map)
);
DEFINE_EVENT(mptcp_dump_mpext, mptcp_sendmsg_frag,
......
......@@ -24,6 +24,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR),
......
......@@ -17,6 +17,7 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */
MPTCP_MIB_DATACSUMERR, /* The data checksum fail */
......
......@@ -825,7 +825,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
opts->suboptions = 0;
if (unlikely(__mptcp_check_fallback(msk)))
if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb)))
return false;
if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
......@@ -1340,8 +1340,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
put_unaligned_be32(mpext->subflow_seq, ptr);
ptr += 1;
if (opts->csum_reqd) {
/* data_len == 0 is reserved for the infinite mapping,
* the checksum will also be set to 0.
*/
put_unaligned_be32(mpext->data_len << 16 |
mptcp_make_csum(mpext), ptr);
(mpext->data_len ? mptcp_make_csum(mpext) : 0),
ptr);
} else {
put_unaligned_be32(mpext->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
......
......@@ -285,7 +285,13 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
pr_debug("fail_seq=%llu", fail_seq);
if (!mptcp_has_another_subflow(sk) && READ_ONCE(msk->allow_infinite_fallback))
subflow->send_infinite_map = 1;
}
/* path manager helpers */
......
......@@ -1229,6 +1229,22 @@ static void mptcp_update_data_checksum(struct sk_buff *skb, int added)
mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset));
}
static void mptcp_update_infinite_map(struct mptcp_sock *msk,
struct sock *ssk,
struct mptcp_ext *mpext)
{
if (!mpext)
return;
mpext->infinite_map = 1;
mpext->data_len = 0;
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
pr_fallback(msk);
__mptcp_do_fallback(msk);
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_data_frag *dfrag,
struct mptcp_sendmsg_info *info)
......@@ -1360,6 +1376,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
out:
if (READ_ONCE(msk->csum_enabled))
mptcp_update_data_checksum(skb, copy);
if (mptcp_subflow_ctx(ssk)->send_infinite_map)
mptcp_update_infinite_map(msk, ssk, mpext);
trace_mptcp_sendmsg_frag(mpext);
mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
return copy;
......@@ -2465,6 +2483,7 @@ static void __mptcp_retrans(struct sock *sk)
dfrag->already_sent = max(dfrag->already_sent, info.sent);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
WRITE_ONCE(msk->allow_infinite_fallback, false);
}
release_sock(ssk);
......@@ -2539,6 +2558,7 @@ static int __mptcp_init_sock(struct sock *sk)
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
WRITE_ONCE(msk->allow_infinite_fallback, true);
msk->recovery = false;
mptcp_pm_data_init(msk);
......@@ -3275,6 +3295,7 @@ bool mptcp_finish_join(struct sock *ssk)
}
subflow->map_seq = READ_ONCE(msk->ack_seq);
WRITE_ONCE(msk->allow_infinite_fallback, false);
out:
mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
......
......@@ -263,6 +263,7 @@ struct mptcp_sock {
bool rcv_fastclose;
bool use_64bit_ack; /* Set when we received a 64-bit DSN */
bool csum_enabled;
bool allow_infinite_fallback;
u8 recvmsg_inq:1,
cork:1,
nodelay:1;
......@@ -440,6 +441,7 @@ struct mptcp_subflow_context {
send_mp_prio : 1,
send_mp_fail : 1,
send_fastclose : 1,
send_infinite_map : 1,
rx_eof : 1,
can_ack : 1, /* only after processing the remote a key */
disposable : 1, /* ctx can be free at ulp release time */
......@@ -876,6 +878,17 @@ static inline void mptcp_do_fallback(struct sock *sk)
#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
{
struct mptcp_ext *mpext;
mpext = skb ? mptcp_get_ext(skb) : NULL;
if (mpext && mpext->infinite_map)
return true;
return false;
}
static inline bool subflow_simultaneous_connect(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
......
......@@ -1006,7 +1006,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
data_len = mpext->data_len;
if (data_len == 0) {
pr_debug("infinite mapping received");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
subflow->map_data_len = 0;
return MAPPING_INVALID;
}
......@@ -1203,35 +1205,39 @@ static bool subflow_check_data_avail(struct sock *ssk)
return false;
fallback:
/* RFC 8684 section 3.7. */
if (subflow->send_mp_fail) {
if (mptcp_has_another_subflow(ssk)) {
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
if (!__mptcp_check_fallback(msk)) {
/* RFC 8684 section 3.7. */
if (subflow->send_mp_fail) {
if (mptcp_has_another_subflow(ssk) ||
!READ_ONCE(msk->allow_infinite_fallback)) {
ssk->sk_err = EBADMSG;
tcp_set_state(ssk, TCP_CLOSE);
subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
tcp_send_active_reset(ssk, GFP_ATOMIC);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
}
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return true;
}
ssk->sk_err = EBADMSG;
tcp_set_state(ssk, TCP_CLOSE);
subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
tcp_send_active_reset(ssk, GFP_ATOMIC);
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return true;
}
if (subflow->mp_join || subflow->fully_established) {
/* fatal protocol error, close the socket.
* subflow_error_report() will introduce the appropriate barriers
*/
ssk->sk_err = EBADMSG;
tcp_set_state(ssk, TCP_CLOSE);
subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMPTCP;
tcp_send_active_reset(ssk, GFP_ATOMIC);
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return false;
if ((subflow->mp_join || subflow->fully_established) && subflow->map_data_len) {
/* fatal protocol error, close the socket.
* subflow_error_report() will introduce the appropriate barriers
*/
ssk->sk_err = EBADMSG;
tcp_set_state(ssk, TCP_CLOSE);
subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMPTCP;
tcp_send_active_reset(ssk, GFP_ATOMIC);
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return false;
}
__mptcp_do_fallback(msk);
}
__mptcp_do_fallback(msk);
skb = skb_peek(&ssk->sk_receive_queue);
subflow->map_valid = 1;
subflow->map_seq = READ_ONCE(msk->ack_seq);
......@@ -1483,6 +1489,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
/* discard the subflow socket */
mptcp_sock_graft(ssk, sk->sk_socket);
iput(SOCK_INODE(sf));
WRITE_ONCE(msk->allow_infinite_fallback, false);
return err;
failed_unlink:
......
......@@ -1106,6 +1106,38 @@ chk_rst_nr()
echo "$extra_msg"
}
chk_infi_nr()
{
local infi_tx=$1
local infi_rx=$2
local count
local dump_stats
printf "%-${nr_blank}s %s" " " "itx"
count=$(ip netns exec $ns2 nstat -as | grep InfiniteMapTx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$infi_tx" ]; then
echo "[fail] got $count infinite map[s] TX expected $infi_tx"
fail_test
dump_stats=1
else
echo -n "[ ok ]"
fi
echo -n " - infirx"
count=$(ip netns exec $ns1 nstat -as | grep InfiniteMapRx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$infi_rx" ]; then
echo "[fail] got $count infinite map[s] RX expected $infi_rx"
fail_test
dump_stats=1
else
echo "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
}
chk_join_nr()
{
local syn_nr=$1
......@@ -1115,7 +1147,8 @@ chk_join_nr()
local csum_ns2=${5:-0}
local fail_nr=${6:-0}
local rst_nr=${7:-0}
local corrupted_pkts=${8:-0}
local infi_nr=${8:-0}
local corrupted_pkts=${9:-0}
local count
local dump_stats
local with_cookie
......@@ -1170,6 +1203,7 @@ chk_join_nr()
chk_csum_nr $csum_ns1 $csum_ns2
chk_fail_nr $fail_nr $fail_nr
chk_rst_nr $rst_nr $rst_nr
chk_infi_nr $infi_nr $infi_nr
fi
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment