Commit a778e93d authored by David S. Miller

Merge branch 'mptcp-dss-checksums'

Mat Martineau says:

====================
mptcp: DSS checksum support

RFC 8684 defines a DSS checksum feature that allows MPTCP to detect
middlebox interference with the MPTCP DSS header and the portion of the
data stream associated with that header. So far, the MPTCP
implementation in the Linux kernel has not supported this feature.

This patch series adds DSS checksum support. By default, the kernel will
not request checksums when sending SYN or SYN/ACK packets for MPTCP
connections. Outgoing checksum requests can be enabled with a
per-namespace net.mptcp.checksum_enabled sysctl. MPTCP connections will
now proceed with DSS checksums when the peer requests them, whether the
sysctl is enabled or not.

Patches 1-5 add checksum bits to the outgoing SYN, SYN/ACK, and data
packet headers. This includes calculating the checksum using a range of
data and the MPTCP DSS mapping for that data.

Patches 6-10 handle the checksum request in the SYN or SYN/ACK, and
receiving and verifying the DSS checksum on data packets.

Patch 11 adjusts the MPTCP-level retransmission process for checksum
compatibility.

Patches 12-14 add checksum-related MIBs, the net.mptcp.checksum_enabled
sysctl, and a checksum field to debug trace output.

Patches 15 & 16 add selftests.

The series is slightly longer than the preferred 15-patch limit that
patchwork warns about. I do try to stay below that whenever possible -
this series does implement one feature and is, I think, cohesive enough
to justify keeping it together. If it's at all problematic please let me
know!

A trivial merge conflict with net/master is introduced in patch 15: a
commit in net/master removes a couple of nearby lines of code.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents e7f3863c af66d3e1
......@@ -24,3 +24,11 @@ add_addr_timeout - INTEGER (seconds)
sysctl.
Default: 120
checksum_enabled - BOOLEAN
Control whether DSS checksum can be enabled.
DSS checksum can be enabled if the value is nonzero. This is a
per-namespace sysctl.
Default: 0
......@@ -23,6 +23,7 @@ struct mptcp_ext {
u64 data_seq;
u32 subflow_seq;
u16 data_len;
__sum16 csum;
u8 use_map:1,
dsn64:1,
data_fin:1,
......@@ -31,7 +32,8 @@ struct mptcp_ext {
mpc_map:1,
frozen:1,
reset_transient:1;
u8 reset_reason:4;
u8 reset_reason:4,
csum_reqd:1;
};
#define MPTCP_RM_IDS_MAX 8
......@@ -63,8 +65,9 @@ struct mptcp_out_options {
struct mptcp_rm_list rm_list;
u8 join_id;
u8 backup;
u8 reset_reason:4;
u8 reset_transient:1;
u8 reset_reason:4,
reset_transient:1,
csum_reqd:1;
u32 nonce;
u64 thmac;
u32 token;
......
......@@ -73,6 +73,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext,
__field(u64, data_seq)
__field(u32, subflow_seq)
__field(u16, data_len)
__field(u16, csum)
__field(u8, use_map)
__field(u8, dsn64)
__field(u8, data_fin)
......@@ -82,6 +83,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext,
__field(u8, frozen)
__field(u8, reset_transient)
__field(u8, reset_reason)
__field(u8, csum_reqd)
),
TP_fast_assign(
......@@ -89,6 +91,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext,
__entry->data_seq = mpext->data_seq;
__entry->subflow_seq = mpext->subflow_seq;
__entry->data_len = mpext->data_len;
__entry->csum = (__force u16)mpext->csum;
__entry->use_map = mpext->use_map;
__entry->dsn64 = mpext->dsn64;
__entry->data_fin = mpext->data_fin;
......@@ -98,16 +101,18 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext,
__entry->frozen = mpext->frozen;
__entry->reset_transient = mpext->reset_transient;
__entry->reset_reason = mpext->reset_reason;
__entry->csum_reqd = mpext->csum_reqd;
),
TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u",
TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u",
__entry->data_ack, __entry->data_seq,
__entry->subflow_seq, __entry->data_len,
__entry->use_map, __entry->dsn64,
__entry->data_fin, __entry->use_ack,
__entry->ack64, __entry->mpc_map,
__entry->frozen, __entry->reset_transient,
__entry->reset_reason)
__entry->csum, __entry->use_map,
__entry->dsn64, __entry->data_fin,
__entry->use_ack, __entry->ack64,
__entry->mpc_map, __entry->frozen,
__entry->reset_transient, __entry->reset_reason,
__entry->csum_reqd)
);
DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status,
......
......@@ -105,6 +105,7 @@ struct mptcp_info {
__u64 mptcpi_rcv_nxt;
__u8 mptcpi_local_addr_used;
__u8 mptcpi_local_addr_max;
__u8 mptcpi_csum_enabled;
};
/*
......
......@@ -23,6 +23,7 @@ struct mptcp_pernet {
u8 mptcp_enabled;
unsigned int add_addr_timeout;
u8 checksum_enabled;
};
static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
......@@ -40,10 +41,16 @@ unsigned int mptcp_get_add_addr_timeout(struct net *net)
return mptcp_get_pernet(net)->add_addr_timeout;
}
int mptcp_is_checksum_enabled(struct net *net)
{
return mptcp_get_pernet(net)->checksum_enabled;
}
/* Load the default values into @pernet: MPTCP enabled, ADD_ADDR timeout
 * set to TCP_RTO_MAX, DSS checksum requests disabled.
 */
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
	pernet->checksum_enabled = 0;
	pernet->add_addr_timeout = TCP_RTO_MAX;
	pernet->mptcp_enabled = 1;
}
#ifdef CONFIG_SYSCTL
......@@ -65,6 +72,14 @@ static struct ctl_table mptcp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "checksum_enabled",
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE
},
{}
};
......@@ -82,6 +97,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[0].data = &pernet->mptcp_enabled;
table[1].data = &pernet->add_addr_timeout;
table[2].data = &pernet->checksum_enabled;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
......
......@@ -25,6 +25,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR),
SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL),
SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE),
SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE),
......
......@@ -18,6 +18,7 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
MPTCP_MIB_DATACSUMERR, /* The data checksum fail */
MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */
MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */
MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */
......
......@@ -144,6 +144,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
unlock_sock_fast(sk, slow);
}
......
......@@ -44,7 +44,20 @@ static void mptcp_parse_option(const struct sk_buff *skb,
else
expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
}
if (opsize != expected_opsize)
/* Cfr RFC 8684 Section 3.3.0:
* If a checksum is present but its use had
* not been negotiated in the MP_CAPABLE handshake, the receiver MUST
* close the subflow with a RST, as it is not behaving as negotiated.
* If a checksum is not present when its use has been negotiated, the
* receiver MUST close the subflow with a RST, as it is considered
* broken
* We parse even options with a mismatching csum presence, so that
* later in subflow_data_ready we can trigger the reset.
*/
if (opsize != expected_opsize &&
(expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA ||
opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM))
break;
/* try to be gentle vs future versions on the initial syn */
......@@ -66,16 +79,9 @@ static void mptcp_parse_option(const struct sk_buff *skb,
* host requires the use of checksums, checksums MUST be used.
* In other words, the only way for checksums not to be used
* is if both hosts in their SYNs set A=0."
*
* Section 3.3.0:
* "If a checksum is not present when its use has been
* negotiated, the receiver MUST close the subflow with a RST as
* it is considered broken."
*
* We don't implement DSS checksum - fall back to TCP.
*/
if (flags & MPTCP_CAP_CHECKSUM_REQD)
break;
mp_opt->csum_reqd = 1;
mp_opt->mp_capable = 1;
if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
......@@ -86,7 +92,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->rcvr_key = get_unaligned_be64(ptr);
ptr += 8;
}
if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) {
/* Section 3.1.:
* "the data parameters in a MP_CAPABLE are semantically
* equivalent to those in a DSS option and can be used
......@@ -98,9 +104,14 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->data_len = get_unaligned_be16(ptr);
ptr += 2;
}
pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) {
mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr);
mp_opt->csum_reqd = 1;
ptr += 2;
}
pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u",
version, flags, opsize, mp_opt->sndr_key,
mp_opt->rcvr_key, mp_opt->data_len);
mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum);
break;
case MPTCPOPT_MP_JOIN:
......@@ -171,10 +182,8 @@ static void mptcp_parse_option(const struct sk_buff *skb,
expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
}
/* RFC 6824, Section 3.3:
* If a checksum is present, but its use had
* not been negotiated in the MP_CAPABLE handshake,
* the checksum field MUST be ignored.
/* Always parse any csum presence combination, we will enforce
* RFC 8684 Section 3.3.0 checks later in subflow_data_ready
*/
if (opsize != expected_opsize &&
opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
......@@ -209,9 +218,15 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->data_len = get_unaligned_be16(ptr);
ptr += 2;
pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) {
mp_opt->csum_reqd = 1;
mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr);
ptr += 2;
}
pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
mp_opt->data_seq, mp_opt->subflow_seq,
mp_opt->data_len);
mp_opt->data_len, mp_opt->csum_reqd, mp_opt->csum);
}
break;
......@@ -323,9 +338,12 @@ static void mptcp_parse_option(const struct sk_buff *skb,
}
}
void mptcp_get_options(const struct sk_buff *skb,
void mptcp_get_options(const struct sock *sk,
const struct sk_buff *skb,
struct mptcp_options_received *mp_opt)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
const struct tcphdr *th = tcp_hdr(skb);
const unsigned char *ptr;
int length;
......@@ -341,6 +359,7 @@ void mptcp_get_options(const struct sk_buff *skb,
mp_opt->dss = 0;
mp_opt->mp_prio = 0;
mp_opt->reset = 0;
mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled);
length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
......@@ -380,6 +399,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
if (subflow->request_mptcp) {
opts->suboptions = OPTION_MPTCP_MPC_SYN;
opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk));
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
} else if (subflow->request_join) {
......@@ -435,8 +455,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_ext *mpext;
unsigned int data_len;
u8 len;
/* When skb is not available, we better over-estimate the emitted
* options len. A full DSS option (28 bytes) is longer than
......@@ -465,16 +487,26 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
opts->suboptions = OPTION_MPTCP_MPC_ACK;
opts->sndr_key = subflow->local_key;
opts->rcvr_key = subflow->remote_key;
opts->csum_reqd = READ_ONCE(msk->csum_enabled);
/* Section 3.1.
* The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
* packets that start the first subflow of an MPTCP connection,
* as well as the first packet that carries data
*/
if (data_len > 0)
*size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
else
if (data_len > 0) {
len = TCPOLEN_MPTCP_MPC_ACK_DATA;
if (opts->csum_reqd) {
/* we need to propagate more info to csum the pseudo hdr */
opts->ext_copy.data_seq = mpext->data_seq;
opts->ext_copy.subflow_seq = mpext->subflow_seq;
opts->ext_copy.csum = mpext->csum;
len += TCPOLEN_MPTCP_DSS_CHECKSUM;
}
*size = ALIGN(len, 4);
} else {
*size = TCPOLEN_MPTCP_MPC_ACK;
}
pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
subflow, subflow->local_key, subflow->remote_key,
......@@ -535,18 +567,21 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
bool ret = false;
u64 ack_seq;
opts->csum_reqd = READ_ONCE(msk->csum_enabled);
mpext = skb ? mptcp_get_ext(skb) : NULL;
if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
unsigned int map_size;
unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
if (mpext) {
if (opts->csum_reqd)
map_size += TCPOLEN_MPTCP_DSS_CHECKSUM;
remaining -= map_size;
dss_size = map_size;
if (mpext)
opts->ext_copy = *mpext;
}
remaining -= map_size;
dss_size = map_size;
if (skb && snd_data_fin_enable)
mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
ret = true;
......@@ -789,6 +824,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
if (subflow_req->mp_capable) {
opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
opts->sndr_key = subflow_req->local_key;
opts->csum_reqd = subflow_req->csum_reqd;
*size = TCPOLEN_MPTCP_MPC_SYNACK;
pr_debug("subflow_req=%p, local_key=%llu",
subflow_req, subflow_req->local_key);
......@@ -1007,7 +1043,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return;
}
mptcp_get_options(skb, &mp_opt);
mptcp_get_options(sk, skb, &mp_opt);
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
return;
......@@ -1099,6 +1135,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
}
mpext->data_len = mp_opt.data_len;
mpext->use_map = 1;
mpext->csum_reqd = mp_opt.csum_reqd;
if (mpext->csum_reqd)
mpext->csum = mp_opt.csum;
}
}
......@@ -1118,25 +1158,50 @@ static void mptcp_set_rwin(const struct tcp_sock *tp)
WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
}
static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
{
struct csum_pseudo_header header;
__wsum csum;
/* cfr RFC 8684 3.3.1.:
* the data sequence number used in the pseudo-header is
* always the 64-bit value, irrespective of what length is used in the
* DSS option itself.
*/
header.data_seq = cpu_to_be64(mpext->data_seq);
header.subflow_seq = htonl(mpext->subflow_seq);
header.data_len = htons(mpext->data_len);
header.csum = 0;
csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum));
return (__force u16)csum_fold(csum);
}
void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
struct mptcp_out_options *opts)
{
if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
u8 len;
u8 len, flag = MPTCP_CAP_HMAC_SHA256;
if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
if (OPTION_MPTCP_MPC_SYN & opts->suboptions) {
len = TCPOLEN_MPTCP_MPC_SYN;
else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
} else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) {
len = TCPOLEN_MPTCP_MPC_SYNACK;
else if (opts->ext_copy.data_len)
} else if (opts->ext_copy.data_len) {
len = TCPOLEN_MPTCP_MPC_ACK_DATA;
else
if (opts->csum_reqd)
len += TCPOLEN_MPTCP_DSS_CHECKSUM;
} else {
len = TCPOLEN_MPTCP_MPC_ACK;
}
if (opts->csum_reqd)
flag |= MPTCP_CAP_CHECKSUM_REQD;
*ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
MPTCP_SUPPORTED_VERSION,
MPTCP_CAP_HMAC_SHA256);
flag);
if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
opts->suboptions))
......@@ -1152,8 +1217,13 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
if (!opts->ext_copy.data_len)
goto mp_capable_done;
put_unaligned_be32(opts->ext_copy.data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
if (opts->csum_reqd) {
put_unaligned_be32(opts->ext_copy.data_len << 16 |
mptcp_make_csum(&opts->ext_copy), ptr);
} else {
put_unaligned_be32(opts->ext_copy.data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
}
ptr += 1;
}
......@@ -1305,6 +1375,9 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
if (mpext->data_fin)
flags |= MPTCP_DSS_DATA_FIN;
if (opts->csum_reqd)
len += TCPOLEN_MPTCP_DSS_CHECKSUM;
}
*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
......@@ -1324,8 +1397,13 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
ptr += 2;
put_unaligned_be32(mpext->subflow_seq, ptr);
ptr += 1;
put_unaligned_be32(mpext->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
if (opts->csum_reqd) {
put_unaligned_be32(mpext->data_len << 16 |
mptcp_make_csum(mpext), ptr);
} else {
put_unaligned_be32(mpext->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
}
}
}
......
......@@ -1308,6 +1308,18 @@ static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation);
}
/* Extend the running data checksum kept in @skb's MPTCP extension with
 * the last @added bytes of the skb.
 *
 * NOTE(review): only the appended [len - added, len) range is handed to
 * skb_checksum() below, while an earlier note on this helper claimed the
 * csum is recomputed on the whole skb at every call and that more status
 * info would be needed - confirm and drop whichever statement is stale.
 */
static void mptcp_update_data_checksum(struct sk_buff *skb, int added)
{
	struct mptcp_ext *mpext = mptcp_get_ext(skb);
	int offset = skb->len - added;
	__wsum running = ~csum_unfold(mpext->csum);
	__wsum appended = skb_checksum(skb, offset, added, 0);

	/* csum_block_add() receives the block's byte offset so it can
	 * account for where the appended range starts within the skb
	 */
	mpext->csum = csum_fold(csum_block_add(running, appended, offset));
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_data_frag *dfrag,
struct mptcp_sendmsg_info *info)
......@@ -1402,10 +1414,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
if (zero_window_probe) {
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
mpext->frozen = 1;
ret = 0;
if (READ_ONCE(msk->csum_enabled))
mptcp_update_data_checksum(tail, ret);
tcp_push_pending_frames(ssk);
return 0;
}
out:
if (READ_ONCE(msk->csum_enabled))
mptcp_update_data_checksum(tail, ret);
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
return ret;
}
......@@ -2359,8 +2375,8 @@ static void __mptcp_retrans(struct sock *sk)
/* limit retransmission to the bytes already sent on some subflows */
info.sent = 0;
info.limit = dfrag->already_sent;
while (info.sent < dfrag->already_sent) {
info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
while (info.sent < info.limit) {
if (!mptcp_alloc_tx_skb(sk, ssk))
break;
......@@ -2372,9 +2388,11 @@ static void __mptcp_retrans(struct sock *sk)
copied += ret;
info.sent += ret;
}
if (copied)
if (copied) {
dfrag->already_sent = max(dfrag->already_sent, info.sent);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
}
mptcp_set_timeout(sk, ssk);
release_sock(ssk);
......@@ -2453,6 +2471,7 @@ static int __mptcp_init_sock(struct sock *sk)
msk->ack_hint = NULL;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
mptcp_pm_data_init(msk);
......@@ -2793,6 +2812,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->token = subflow_req->token;
msk->subflow = NULL;
WRITE_ONCE(msk->fully_established, false);
if (mp_opt->csum_reqd)
WRITE_ONCE(msk->csum_enabled, true);
msk->write_seq = subflow_req->idsn + 1;
msk->snd_nxt = msk->write_seq;
......
......@@ -68,6 +68,8 @@
#define TCPOLEN_MPTCP_FASTCLOSE 12
#define TCPOLEN_MPTCP_RST 4
#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA)
/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
#define MPTCPOPT_HMAC_LEN 20
......@@ -124,6 +126,7 @@ struct mptcp_options_received {
u64 data_seq;
u32 subflow_seq;
u16 data_len;
__sum16 csum;
u16 mp_capable : 1,
mp_join : 1,
fastclose : 1,
......@@ -133,6 +136,7 @@ struct mptcp_options_received {
rm_addr : 1,
mp_prio : 1,
echo : 1,
csum_reqd : 1,
backup : 1;
u32 token;
u32 nonce;
......@@ -234,6 +238,7 @@ struct mptcp_sock {
bool snd_data_fin_enable;
bool rcv_fastclose;
bool use_64bit_ack; /* Set when we received a 64-bit DSN */
bool csum_enabled;
spinlock_t join_list_lock;
struct sock *ack_hint;
struct work_struct work;
......@@ -335,11 +340,19 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list);
}
/* Pseudo-header summed together with the payload checksum when building
 * or verifying a DSS checksum. The visible users fill the first three
 * fields with cpu_to_be64()/htonl()/htons() conversions of the mapping
 * values and set csum to 0 before summing the structure.
 */
struct csum_pseudo_header {
__be64 data_seq; /* 64-bit data sequence number of the mapping */
__be32 subflow_seq; /* subflow sequence number of the mapping */
__be16 data_len; /* data length of the mapping */
__sum16 csum; /* checksum field, zeroed by the visible users */
};
struct mptcp_subflow_request_sock {
struct tcp_request_sock sk;
u16 mp_capable : 1,
mp_join : 1,
backup : 1;
backup : 1,
csum_reqd : 1;
u8 local_id;
u8 remote_id;
u64 local_key;
......@@ -387,6 +400,8 @@ struct mptcp_subflow_context {
u32 map_subflow_seq;
u32 ssn_offset;
u32 map_data_len;
__wsum map_data_csum;
u32 map_csum_len;
u32 request_mptcp : 1, /* send MP_CAPABLE */
request_join : 1, /* send MP_JOIN */
request_bkup : 1,
......@@ -396,6 +411,8 @@ struct mptcp_subflow_context {
pm_notified : 1, /* PM hook called for established status */
conn_finished : 1,
map_valid : 1,
map_csum_reqd : 1,
map_data_fin : 1,
mpc_map : 1,
backup : 1,
send_mp_prio : 1,
......@@ -525,6 +542,7 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su
int mptcp_is_enabled(struct net *net);
unsigned int mptcp_get_add_addr_timeout(struct net *net);
int mptcp_is_checksum_enabled(struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool mptcp_subflow_data_available(struct sock *sk);
......@@ -576,7 +594,8 @@ int __init mptcp_proto_v6_init(void);
struct sock *mptcp_sk_clone(const struct sock *sk,
const struct mptcp_options_received *mp_opt,
struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
void mptcp_get_options(const struct sock *sk,
const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
......
......@@ -108,6 +108,7 @@ static void subflow_init_req(struct request_sock *req, const struct sock *sk_lis
subflow_req->mp_capable = 0;
subflow_req->mp_join = 0;
subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener));
subflow_req->msk = NULL;
mptcp_token_init_request(req);
}
......@@ -150,7 +151,7 @@ static int subflow_check_req(struct request_sock *req,
return -EINVAL;
#endif
mptcp_get_options(skb, &mp_opt);
mptcp_get_options(sk_listener, skb, &mp_opt);
if (mp_opt.mp_capable) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
......@@ -247,7 +248,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
int err;
subflow_init_req(req, sk_listener);
mptcp_get_options(skb, &mp_opt);
mptcp_get_options(sk_listener, skb, &mp_opt);
if (mp_opt.mp_capable && mp_opt.mp_join)
return -EINVAL;
......@@ -394,7 +395,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
mptcp_get_options(skb, &mp_opt);
mptcp_get_options(sk, skb, &mp_opt);
if (subflow->request_mptcp) {
if (!mp_opt.mp_capable) {
MPTCP_INC_STATS(sock_net(sk),
......@@ -404,6 +405,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
goto fallback;
}
if (mp_opt.csum_reqd)
WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true);
subflow->mp_capable = 1;
subflow->can_ack = 1;
subflow->remote_key = mp_opt.sndr_key;
......@@ -638,7 +641,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
* reordered MPC will cause fallback, but we don't have other
* options.
*/
mptcp_get_options(skb, &mp_opt);
mptcp_get_options(sk, skb, &mp_opt);
if (!mp_opt.mp_capable) {
fallback = true;
goto create_child;
......@@ -648,7 +651,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
if (!new_msk)
fallback = true;
} else if (subflow_req->mp_join) {
mptcp_get_options(skb, &mp_opt);
mptcp_get_options(sk, skb, &mp_opt);
if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
!mptcp_can_accept_new_subflow(subflow_req->msk)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
......@@ -824,10 +827,92 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
return true;
}
/* Verify the DSS checksum of the mapping currently tracked by @ssk's
 * subflow context.
 *
 * Walks the subflow receive queue starting at @skb, folding payload into
 * subflow->map_data_csum until map_data_len bytes are covered, then adds
 * the pseudo-header and requires the folded total to be zero.
 *
 * Returns MAPPING_OK when @csum_reqd is false, when the mapping was
 * already fully summed on a previous call, or when the checksum matches;
 * MAPPING_EMPTY when the receive queue does not yet hold the whole
 * mapping; on mismatch it bumps MPTCP_MIB_DATACSUMERR and returns
 * MAPPING_INVALID for an MP_JOIN subflow, MAPPING_DUMMY otherwise.
 */
static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb,
bool csum_reqd)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct csum_pseudo_header header;
u32 offset, seq, delta;
__wsum csum;
int len;
if (!csum_reqd)
return MAPPING_OK;
/* mapping already validated on previous traversal */
if (subflow->map_csum_len == subflow->map_data_len)
return MAPPING_OK;
/* traverse the receive queue, ensuring it contains a full
* DSS mapping and accumulating the related csum.
* Preserve the accumulated csum across multiple calls, to compute
* the csum only once
*/
/* delta: mapping bytes still to be folded into map_data_csum */
delta = subflow->map_data_len - subflow->map_csum_len;
for (;;) {
/* sequence number of the first byte not yet summed, and its
* position inside the current skb
* (assumes the mapping starts at copied_seq - TODO confirm)
*/
seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
offset = seq - TCP_SKB_CB(skb)->seq;
/* if the current skb has not been accounted yet, csum its contents
* up to the amount covered by the current DSS
*/
if (offset < skb->len) {
/* shadows the function-level csum: holds only this skb's
* contribution
*/
__wsum csum;
len = min(skb->len - offset, delta);
csum = skb_checksum(skb, offset, len, 0);
subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum,
subflow->map_csum_len);
delta -= len;
subflow->map_csum_len += len;
}
/* the whole mapping has been summed: go check the result */
if (delta == 0)
break;
if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) {
/* if this subflow is closed, the partial mapping
* will be never completed; flush the pending skbs, so
* that subflow_sched_work_if_closed() can kick in
*/
if (unlikely(ssk->sk_state == TCP_CLOSE))
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
/* not enough data to validate the csum */
return MAPPING_EMPTY;
}
/* the DSS mapping for next skbs will be validated later,
* when a get_mapping_status call will process such skb
*/
skb = skb->next;
}
/* note that 'map_data_len' accounts only for the carried data, does
* not include the eventual seq increment due to the data fin,
* while the pseudo header requires the original DSS data len,
* including that
*/
header.data_seq = cpu_to_be64(subflow->map_seq);
header.subflow_seq = htonl(subflow->map_subflow_seq);
header.data_len = htons(subflow->map_data_len + subflow->map_data_fin);
header.csum = 0;
/* a non-zero folded total is treated as a checksum failure */
csum = csum_partial(&header, sizeof(header), subflow->map_data_csum);
if (unlikely(csum_fold(csum))) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY;
}
return MAPPING_OK;
}
static enum mapping_status get_mapping_status(struct sock *ssk,
struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
bool csum_reqd = READ_ONCE(msk->csum_enabled);
struct mptcp_ext *mpext;
struct sk_buff *skb;
u16 data_len;
......@@ -920,9 +1005,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
/* Allow replacing only with an identical map */
if (subflow->map_seq == map_seq &&
subflow->map_subflow_seq == mpext->subflow_seq &&
subflow->map_data_len == data_len) {
subflow->map_data_len == data_len &&
subflow->map_csum_reqd == mpext->csum_reqd) {
skb_ext_del(skb, SKB_EXT_MPTCP);
return MAPPING_OK;
goto validate_csum;
}
/* If this skb data are fully covered by the current mapping,
......@@ -934,17 +1020,27 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
}
/* will validate the next map after consuming the current one */
return MAPPING_OK;
goto validate_csum;
}
subflow->map_seq = map_seq;
subflow->map_subflow_seq = mpext->subflow_seq;
subflow->map_data_len = data_len;
subflow->map_valid = 1;
subflow->map_data_fin = mpext->data_fin;
subflow->mpc_map = mpext->mpc_map;
pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
subflow->map_csum_reqd = mpext->csum_reqd;
subflow->map_csum_len = 0;
subflow->map_data_csum = csum_unfold(mpext->csum);
/* Cfr RFC 8684 Section 3.3.0 */
if (unlikely(subflow->map_csum_reqd != csum_reqd))
return MAPPING_INVALID;
pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
subflow->map_seq, subflow->map_subflow_seq,
subflow->map_data_len);
subflow->map_data_len, subflow->map_csum_reqd,
subflow->map_data_csum);
validate_seq:
/* we revalidate valid mapping on new skb, because we must ensure
......@@ -954,7 +1050,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
return MAPPING_INVALID;
skb_ext_del(skb, SKB_EXT_MPTCP);
return MAPPING_OK;
validate_csum:
return validate_data_csum(ssk, skb, csum_reqd);
}
static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
......
......@@ -3,7 +3,7 @@
time_start=$(date +%s)
optstring="S:R:d:e:l:r:h4cm:f:t"
optstring="S:R:d:e:l:r:h4cm:f:tC"
ret=0
sin=""
sout=""
......@@ -22,6 +22,7 @@ sndbuf=0
rcvbuf=0
options_log=true
do_tcp=0
checksum=false
filesize=0
if [ $tc_loss -eq 100 ];then
......@@ -47,6 +48,7 @@ usage() {
echo -e "\t-R: set rcvbuf value (default: use kernel default)"
echo -e "\t-m: test mode (poll, sendfile; default: poll)"
echo -e "\t-t: also run tests with TCP (use twice to non-fallback tcp)"
echo -e "\t-C: enable the MPTCP data checksum"
}
while getopts "$optstring" option;do
......@@ -104,6 +106,9 @@ while getopts "$optstring" option;do
"t")
do_tcp=$((do_tcp+1))
;;
"C")
checksum=true
;;
"?")
usage $0
exit 1
......@@ -200,6 +205,12 @@ ip -net "$ns4" route add default via dead:beef:3::2
# use TCP syn cookies, even if no flooding was detected.
ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2
if $checksum; then
for i in "$ns1" "$ns2" "$ns3" "$ns4";do
ip netns exec $i sysctl -q net.mptcp.checksum_enabled=1
done
fi
set_ethtool_flags() {
local ns="$1"
local dev="$2"
......
......@@ -12,6 +12,7 @@ timeout_poll=30
timeout_test=$((timeout_poll * 2 + 1))
mptcp_connect=""
capture=0
checksum=0
do_all_tests=1
TEST_COUNT=0
......@@ -49,6 +50,9 @@ init()
ip netns exec $netns sysctl -q net.mptcp.enabled=1
ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0
ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0
if [ $checksum -eq 1 ]; then
ip netns exec $netns sysctl -q net.mptcp.checksum_enabled=1
fi
done
# ns1 ns2
......@@ -124,6 +128,17 @@ reset_with_add_addr_timeout()
-j DROP
}
# Call reset, then set net.mptcp.checksum_enabled in both namespaces.
#   $1: value written in $ns1
#   $2: value written in $ns2
reset_with_checksum()
{
	local csum_ns1=$1
	local csum_ns2=$2

	reset

	ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$csum_ns1
	ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$csum_ns2
}
ip -Version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without ip tool"
......@@ -476,6 +491,45 @@ run_tests()
fi
}
# Check that neither namespace recorded a data checksum error.
#   $1: optional test title; when given, the line is prefixed with the
#       current TEST_COUNT, otherwise with padding.
# Reads the MPTcpExtDataCsumErr counter from nstat in $ns1 and $ns2; any
# non-zero value sets ret=1 and causes the MPTcp counters of both
# namespaces to be dumped.
chk_csum_nr()
{
	local msg=${1:-""}
	local count
	local dump_stats

	if [ -n "$msg" ]; then
		printf "%02u" "$TEST_COUNT"
	else
		echo -n " "
	fi
	printf " %-36s %s" "$msg" "sum"

	# first namespace
	count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
	count=${count:-0}
	if [ "$count" = 0 ]; then
		echo -n "[ ok ]"
	else
		echo "[fail] got $count data checksum error[s] expected 0"
		ret=1
		dump_stats=1
	fi

	# second namespace
	echo -n " - csum "
	count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
	count=${count:-0}
	if [ "$count" = 0 ]; then
		echo "[ ok ]"
	else
		echo "[fail] got $count data checksum error[s] expected 0"
		ret=1
		dump_stats=1
	fi

	if [ "${dump_stats}" = 1 ]; then
		echo Server ns stats
		ip netns exec $ns1 nstat -as | grep MPTcp
		echo Client ns stats
		ip netns exec $ns2 nstat -as | grep MPTcp
	fi
}
chk_join_nr()
{
local msg="$1"
......@@ -523,6 +577,9 @@ chk_join_nr()
echo Client ns stats
ip netns exec $ns2 nstat -as | grep MPTcp
fi
if [ $checksum -eq 1 ]; then
chk_csum_nr
fi
}
chk_add_nr()
......@@ -1374,6 +1431,37 @@ syncookies_tests()
chk_add_nr 1 1
}
# Run one transfer for each combination of the checksum_enabled sysctl in
# ns1/ns2 (0 0, 1 1, 0 1, 1 0) and check that no data checksum error is
# reported.
checksum_tests()
{
	local combo
	local ns1_csum
	local ns2_csum

	# each entry is "<ns1 setting> <ns2 setting>"
	for combo in "0 0" "1 1" "0 1" "1 0"; do
		ns1_csum=${combo% *}
		ns2_csum=${combo#* }

		# checksum test $ns1_csum $ns2_csum
		reset_with_checksum $ns1_csum $ns2_csum
		ip netns exec $ns1 ./pm_nl_ctl limits 0 1
		ip netns exec $ns2 ./pm_nl_ctl limits 0 1
		run_tests $ns1 $ns2 10.0.1.1
		chk_csum_nr "checksum test $combo"
	done
}
all_tests()
{
subflows_tests
......@@ -1387,6 +1475,7 @@ all_tests()
backup_tests
add_addr_ports_tests
syncookies_tests
checksum_tests
}
usage()
......@@ -1403,7 +1492,9 @@ usage()
echo " -b backup_tests"
echo " -p add_addr_ports_tests"
echo " -k syncookies_tests"
echo " -S checksum_tests"
echo " -c capture pcap files"
echo " -C enable data checksum"
echo " -h help"
}
......@@ -1418,13 +1509,16 @@ make_file "$sin" "server" 1
trap cleanup EXIT
for arg in "$@"; do
# check for "capture" arg before launching tests
# check for "capture/checksum" args before launching tests
if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then
capture=1
fi
if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"C"[0-9a-zA-Z]*$ ]]; then
checksum=1
fi
# exception for the capture option, the rest means: a part of the tests
if [ "${arg}" != "-c" ]; then
# exception for the capture/checksum options, the rest means: a part of the tests
if [ "${arg}" != "-c" ] && [ "${arg}" != "-C" ]; then
do_all_tests=0
fi
done
......@@ -1434,7 +1528,7 @@ if [ $do_all_tests -eq 1 ]; then
exit $ret
fi
while getopts 'fsltra64bpkch' opt; do
while getopts 'fsltra64bpkchCS' opt; do
case $opt in
f)
subflows_tests
......@@ -1469,8 +1563,13 @@ while getopts 'fsltra64bpkch' opt; do
k)
syncookies_tests
;;
S)
checksum_tests
;;
c)
;;
C)
;;
h | *)
usage
;;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment