Commit c189b548 authored by David S. Miller

Merge branch 'mptcp-multiple-subflows-path-management'

Mat Martineau says:

====================
Multipath TCP part 3: Multiple subflows and path management

v2 -> v3: Remove 'inline' in .c files, fix uapi bit macros, and rebase.

v1 -> v2: Rebase on current net-next, fix for netlink limit setting,
and update .gitignore for selftest.

This patch set allows more than one TCP subflow to be established and
used for a multipath TCP connection. Subflows are added to an existing
connection using the MP_JOIN option during the 3-way handshake. With
multiple TCP subflows available, sent data is now stored in the MPTCP
socket so it may be retransmitted on any TCP subflow if there is no
DATA_ACK before a timeout. If an MPTCP-level timeout occurs, data is
retransmitted using an available subflow. Storing this sent data
requires the addition of memory accounting at the MPTCP level, which was
previously delegated to the single subflow. Incoming DATA_ACKs now free
data from the MPTCP-level retransmit buffer.

IP addresses available for new subflow connections can now be advertised
and received with the ADD_ADDR option, and the corresponding REMOVE_ADDR
option likewise advertises that an address is no longer available.

The MPTCP path manager netlink interface has commands to set in-kernel
limits for the number of concurrent subflows and control the
advertisement of IP addresses between peers.

To track and debug MPTCP connections there are new MPTCP MIB counters,
and subflow context can be requested using inet_diag. The MPTCP
self-tests now validate multiple-subflow operation and the netlink path
manager interface.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 41b14502 b08fbf24
...@@ -11727,6 +11727,7 @@ W: https://github.com/multipath-tcp/mptcp_net-next/wiki ...@@ -11727,6 +11727,7 @@ W: https://github.com/multipath-tcp/mptcp_net-next/wiki
B: https://github.com/multipath-tcp/mptcp_net-next/issues B: https://github.com/multipath-tcp/mptcp_net-next/issues
S: Maintained S: Maintained
F: include/net/mptcp.h F: include/net/mptcp.h
F: include/uapi/linux/mptcp.h
F: net/mptcp/ F: net/mptcp/
F: tools/testing/selftests/net/mptcp/ F: tools/testing/selftests/net/mptcp/
......
...@@ -86,9 +86,19 @@ struct mptcp_options_received { ...@@ -86,9 +86,19 @@ struct mptcp_options_received {
u64 data_seq; u64 data_seq;
u32 subflow_seq; u32 subflow_seq;
u16 data_len; u16 data_len;
u8 mp_capable : 1, u16 mp_capable : 1,
mp_join : 1, mp_join : 1,
dss : 1; dss : 1,
add_addr : 1,
rm_addr : 1,
family : 4,
echo : 1,
backup : 1;
u32 token;
u32 nonce;
u64 thmac;
u8 hmac[20];
u8 join_id;
u8 use_map:1, u8 use_map:1,
dsn64:1, dsn64:1,
data_fin:1, data_fin:1,
...@@ -96,6 +106,16 @@ struct mptcp_options_received { ...@@ -96,6 +106,16 @@ struct mptcp_options_received {
ack64:1, ack64:1,
mpc_map:1, mpc_map:1,
__unused:2; __unused:2;
u8 addr_id;
u8 rm_id;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct in6_addr addr6;
#endif
};
u64 ahmac;
u16 port;
}; };
#endif #endif
...@@ -131,6 +151,8 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) ...@@ -131,6 +151,8 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
#if IS_ENABLED(CONFIG_MPTCP) #if IS_ENABLED(CONFIG_MPTCP)
rx_opt->mptcp.mp_capable = 0; rx_opt->mptcp.mp_capable = 0;
rx_opt->mptcp.mp_join = 0; rx_opt->mptcp.mp_join = 0;
rx_opt->mptcp.add_addr = 0;
rx_opt->mptcp.rm_addr = 0;
rx_opt->mptcp.dss = 0; rx_opt->mptcp.dss = 0;
#endif #endif
} }
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
#include <linux/tcp.h> #include <linux/tcp.h>
#include <linux/types.h> #include <linux/types.h>
struct seq_file;
/* MPTCP sk_buff extension data */ /* MPTCP sk_buff extension data */
struct mptcp_ext { struct mptcp_ext {
u64 data_ack; u64 data_ack;
...@@ -33,6 +35,21 @@ struct mptcp_out_options { ...@@ -33,6 +35,21 @@ struct mptcp_out_options {
u16 suboptions; u16 suboptions;
u64 sndr_key; u64 sndr_key;
u64 rcvr_key; u64 rcvr_key;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct in6_addr addr6;
#endif
};
u8 addr_id;
u64 ahmac;
u8 rm_id;
u8 join_id;
u8 backup;
u32 nonce;
u64 thmac;
u32 token;
u8 hmac[20];
struct mptcp_ext ext_copy; struct mptcp_ext ext_copy;
#endif #endif
}; };
...@@ -106,6 +123,9 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, ...@@ -106,6 +123,9 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
skb_ext_find(from, SKB_EXT_MPTCP)); skb_ext_find(from, SKB_EXT_MPTCP));
} }
bool mptcp_sk_is_subflow(const struct sock *sk);
void mptcp_seq_show(struct seq_file *seq);
#else #else
static inline void mptcp_init(void) static inline void mptcp_init(void)
...@@ -172,6 +192,12 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, ...@@ -172,6 +192,12 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
return true; return true;
} }
/* Stub used when CONFIG_MPTCP is disabled: always reports "not a subflow". */
static inline bool mptcp_sk_is_subflow(const struct sock *sk)
{
return false;
}
/* Stub used when CONFIG_MPTCP is disabled: emits nothing into @seq. */
static inline void mptcp_seq_show(struct seq_file *seq) { }
#endif /* CONFIG_MPTCP */ #endif /* CONFIG_MPTCP */
#if IS_ENABLED(CONFIG_MPTCP_IPV6) #if IS_ENABLED(CONFIG_MPTCP_IPV6)
......
...@@ -27,6 +27,9 @@ struct netns_mib { ...@@ -27,6 +27,9 @@ struct netns_mib {
#if IS_ENABLED(CONFIG_TLS) #if IS_ENABLED(CONFIG_TLS)
DEFINE_SNMP_STAT(struct linux_tls_mib, tls_statistics); DEFINE_SNMP_STAT(struct linux_tls_mib, tls_statistics);
#endif #endif
#ifdef CONFIG_MPTCP
DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics);
#endif
}; };
#endif #endif
...@@ -166,6 +166,7 @@ enum { ...@@ -166,6 +166,7 @@ enum {
INET_ULP_INFO_UNSPEC, INET_ULP_INFO_UNSPEC,
INET_ULP_INFO_NAME, INET_ULP_INFO_NAME,
INET_ULP_INFO_TLS, INET_ULP_INFO_TLS,
INET_ULP_INFO_MPTCP,
__INET_ULP_INFO_MAX, __INET_ULP_INFO_MAX,
}; };
#define INET_ULP_INFO_MAX (__INET_ULP_INFO_MAX - 1) #define INET_ULP_INFO_MAX (__INET_ULP_INFO_MAX - 1)
......
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _UAPI_MPTCP_H
#define _UAPI_MPTCP_H
#include <linux/const.h>
#include <linux/types.h>
/* Bits reported in the u32 MPTCP_SUBFLOW_ATTR_FLAGS attribute.
 *
 * The in-kernel diag code sets each bit from one subflow context field:
 *   MCAP_REM          <- mp_capable        MCAP_LOC <- request_mptcp
 *   JOIN_REM          <- mp_join           JOIN_LOC <- request_join
 *   BKUP_REM          <- backup            BKUP_LOC <- request_bkup
 *   FULLY_ESTABLISHED <- fully_established
 *   CONNECTED         <- conn_finished
 *   MAPVALID          <- map_valid
 */
#define MPTCP_SUBFLOW_FLAG_MCAP_REM _BITUL(0)
#define MPTCP_SUBFLOW_FLAG_MCAP_LOC _BITUL(1)
#define MPTCP_SUBFLOW_FLAG_JOIN_REM _BITUL(2)
#define MPTCP_SUBFLOW_FLAG_JOIN_LOC _BITUL(3)
#define MPTCP_SUBFLOW_FLAG_BKUP_REM _BITUL(4)
#define MPTCP_SUBFLOW_FLAG_BKUP_LOC _BITUL(5)
#define MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED _BITUL(6)
#define MPTCP_SUBFLOW_FLAG_CONNECTED _BITUL(7)
#define MPTCP_SUBFLOW_FLAG_MAPVALID _BITUL(8)
/* Attributes nested inside INET_ULP_INFO_MPTCP in an inet_diag reply.
 * The payload widths noted below are the ones the kernel emits.
 */
enum {
MPTCP_SUBFLOW_ATTR_UNSPEC,
MPTCP_SUBFLOW_ATTR_TOKEN_REM, /* u32 */
MPTCP_SUBFLOW_ATTR_TOKEN_LOC, /* u32 */
MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, /* u32 */
MPTCP_SUBFLOW_ATTR_MAP_SEQ, /* u64 */
MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, /* u32 */
MPTCP_SUBFLOW_ATTR_SSN_OFFSET, /* u32 */
MPTCP_SUBFLOW_ATTR_MAP_DATALEN, /* u16 */
MPTCP_SUBFLOW_ATTR_FLAGS, /* u32, MPTCP_SUBFLOW_FLAG_* bits */
MPTCP_SUBFLOW_ATTR_ID_REM, /* u8 */
MPTCP_SUBFLOW_ATTR_ID_LOC, /* u8 */
MPTCP_SUBFLOW_ATTR_PAD, /* padding attribute used to align MAP_SEQ */
__MPTCP_SUBFLOW_ATTR_MAX
};
/* Highest valid attribute type in the enum above. */
#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1)
/* netlink interface */
#define MPTCP_PM_NAME "mptcp_pm"
#define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds"
#define MPTCP_PM_VER 0x1
/*
* ATTR types defined for MPTCP
*/
enum {
MPTCP_PM_ATTR_UNSPEC,
MPTCP_PM_ATTR_ADDR, /* nested address */
MPTCP_PM_ATTR_RCV_ADD_ADDRS, /* u32 */
MPTCP_PM_ATTR_SUBFLOWS, /* u32 */
__MPTCP_PM_ATTR_MAX
};
#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1)
enum {
MPTCP_PM_ADDR_ATTR_UNSPEC,
MPTCP_PM_ADDR_ATTR_FAMILY, /* u16 */
MPTCP_PM_ADDR_ATTR_ID, /* u8 */
MPTCP_PM_ADDR_ATTR_ADDR4, /* struct in_addr */
MPTCP_PM_ADDR_ATTR_ADDR6, /* struct in6_addr */
MPTCP_PM_ADDR_ATTR_PORT, /* u16 */
MPTCP_PM_ADDR_ATTR_FLAGS, /* u32 */
MPTCP_PM_ADDR_ATTR_IF_IDX, /* s32 */
__MPTCP_PM_ADDR_ATTR_MAX
};
#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1)
#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0)
#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1)
#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2)
/* Commands of the MPTCP path manager netlink family (MPTCP_PM_NAME).
 * NOTE(review): judging by the names, these add/delete/get/flush the
 * configured addresses and set/get the limits — confirm against the
 * path manager netlink implementation.
 */
enum {
MPTCP_PM_CMD_UNSPEC,
MPTCP_PM_CMD_ADD_ADDR,
MPTCP_PM_CMD_DEL_ADDR,
MPTCP_PM_CMD_GET_ADDR,
MPTCP_PM_CMD_FLUSH_ADDRS,
MPTCP_PM_CMD_SET_LIMITS,
MPTCP_PM_CMD_GET_LIMITS,
__MPTCP_PM_CMD_AFTER_LAST
};
#endif /* _UAPI_MPTCP_H */
...@@ -1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) ...@@ -1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
free_percpu(net->mib.net_statistics); free_percpu(net->mib.net_statistics);
free_percpu(net->mib.ip_statistics); free_percpu(net->mib.ip_statistics);
free_percpu(net->mib.tcp_statistics); free_percpu(net->mib.tcp_statistics);
#ifdef CONFIG_MPTCP
/* allocated on demand, see mptcp_init_sock() */
free_percpu(net->mib.mptcp_statistics);
#endif
} }
static __net_initdata struct pernet_operations ipv4_mib_ops = { static __net_initdata struct pernet_operations ipv4_mib_ops = {
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include <net/icmp.h> #include <net/icmp.h>
#include <net/protocol.h> #include <net/protocol.h>
#include <net/tcp.h> #include <net/tcp.h>
#include <net/mptcp.h>
#include <net/udp.h> #include <net/udp.h>
#include <net/udplite.h> #include <net/udplite.h>
#include <linux/bottom_half.h> #include <linux/bottom_half.h>
...@@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) ...@@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
offsetof(struct ipstats_mib, syncp))); offsetof(struct ipstats_mib, syncp)));
seq_putc(seq, '\n'); seq_putc(seq, '\n');
mptcp_seq_show(seq);
return 0; return 0;
} }
......
...@@ -774,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, ...@@ -774,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child) if (!child)
goto listen_overflow; goto listen_overflow;
if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) {
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
inet_csk_reqsk_queue_drop_and_put(sk, req);
return child;
}
sock_rps_save_rxhash(child, skb); sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req); tcp_synack_rtt_meas(child, req);
*req_stolen = !own_req; *req_stolen = !own_req;
......
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_MPTCP) += mptcp.o obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
mib.o pm_netlink.o
...@@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) ...@@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
*idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6]));
} }
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
void *hmac)
{ {
u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
__be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
...@@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, ...@@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
u8 key2be[8]; u8 key2be[8];
int i; int i;
if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
len = SHA256_DIGEST_SIZE;
put_unaligned_be64(key1, key1be); put_unaligned_be64(key1, key1be);
put_unaligned_be64(key2, key2be); put_unaligned_be64(key2, key2be);
...@@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, ...@@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
for (i = 0; i < 8; i++) for (i = 0; i < 8; i++)
input[i + 8] ^= key2be[i]; input[i + 8] ^= key2be[i];
put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]);
sha256_init(&state); sha256_init(&state);
sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); sha256_update(&state, input, SHA256_BLOCK_SIZE + len);
/* emit sha256(K1 || msg) on the second input block, so we can /* emit sha256(K1 || msg) on the second input block, so we can
* reuse 'input' for the last hashing * reuse 'input' for the last hashing
...@@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void) ...@@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void)
char hmac[20], hmac_hex[41]; char hmac[20], hmac_hex[41];
u32 nonce1, nonce2; u32 nonce1, nonce2;
u64 key1, key2; u64 key1, key2;
u8 msg[8];
int i, j; int i, j;
for (i = 0; i < ARRAY_SIZE(tests); ++i) { for (i = 0; i < ARRAY_SIZE(tests); ++i) {
...@@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void) ...@@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void)
nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); put_unaligned_be32(nonce1, &msg[0]);
put_unaligned_be32(nonce2, &msg[4]);
mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
for (j = 0; j < 20; ++j) for (j = 0; j < 20; ++j)
sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
hmac_hex[40] = 0; hmac_hex[40] = 0;
......
// SPDX-License-Identifier: GPL-2.0
/* MPTCP socket monitoring support
*
* Copyright (c) 2019 Red Hat
*
* Author: Davide Caratti <dcaratti@redhat.com>
*/
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/inet_diag.h>
#include <net/netlink.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
/* Append the MPTCP subflow state of @sk to @skb as a nested
 * INET_ULP_INFO_MPTCP attribute.
 *
 * Returns 0 on success. Also returns 0, after cancelling the nest so that
 * nothing is emitted, when the socket carries no ULP data. Returns
 * -EMSGSIZE when @skb cannot hold the nest or one of its attributes.
 */
static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *ctx;
	struct nlattr *nest;
	u32 bits = 0;
	int ret = 0;

	nest = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
	if (!nest)
		return -EMSGSIZE;

	rcu_read_lock();
	ctx = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
	if (!ctx)
		goto cancel;	/* ret is still 0: not an error */

	/* fold the per-subflow bitfields into the exported flag word */
	bits |= ctx->mp_capable ? MPTCP_SUBFLOW_FLAG_MCAP_REM : 0;
	bits |= ctx->request_mptcp ? MPTCP_SUBFLOW_FLAG_MCAP_LOC : 0;
	bits |= ctx->mp_join ? MPTCP_SUBFLOW_FLAG_JOIN_REM : 0;
	bits |= ctx->request_join ? MPTCP_SUBFLOW_FLAG_JOIN_LOC : 0;
	bits |= ctx->backup ? MPTCP_SUBFLOW_FLAG_BKUP_REM : 0;
	bits |= ctx->request_bkup ? MPTCP_SUBFLOW_FLAG_BKUP_LOC : 0;
	bits |= ctx->fully_established ?
		MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED : 0;
	bits |= ctx->conn_finished ? MPTCP_SUBFLOW_FLAG_CONNECTED : 0;
	bits |= ctx->map_valid ? MPTCP_SUBFLOW_FLAG_MAPVALID : 0;

	/* from here on, any failure means the skb ran out of room */
	ret = -EMSGSIZE;

	if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, ctx->remote_token))
		goto cancel;
	if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, ctx->token))
		goto cancel;
	if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
			ctx->rel_write_seq))
		goto cancel;
	if (nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, ctx->map_seq,
			      MPTCP_SUBFLOW_ATTR_PAD))
		goto cancel;
	if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
			ctx->map_subflow_seq))
		goto cancel;
	if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, ctx->ssn_offset))
		goto cancel;
	if (nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
			ctx->map_data_len))
		goto cancel;
	if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, bits))
		goto cancel;
	if (nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, ctx->remote_id))
		goto cancel;
	if (nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, ctx->local_id))
		goto cancel;

	rcu_read_unlock();
	nla_nest_end(skb, nest);
	return 0;

cancel:
	rcu_read_unlock();
	nla_nest_cancel(skb, nest);
	return ret;
}
/* Report how much netlink space subflow_get_info() may need.
 *
 * @sk is unused: the estimate is a constant covering the nest header plus
 * every attribute subflow_get_info() can emit. Each term must use the same
 * payload width as the matching nla_put_*() call there.
 */
static size_t subflow_get_info_size(const struct sock *sk)
{
	size_t size = 0;

	size += nla_total_size(0) +	/* INET_ULP_INFO_MPTCP */
		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_TOKEN_REM */
		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */
		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
		nla_total_size_64bit(8) +	/* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
		/* emitted with nla_put_u32(), so size it as 4 bytes, not 2 */
		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
		nla_total_size(2) +	/* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_FLAGS */
		nla_total_size(1) +	/* MPTCP_SUBFLOW_ATTR_ID_REM */
		nla_total_size(1) +	/* MPTCP_SUBFLOW_ATTR_ID_LOC */
		0;
	return size;
}
/* Install the subflow diag callbacks (size estimate and dump) into @ops. */
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops)
{
	/* the two hooks are independent: sizing first, then the dumper */
	ops->get_info_size = subflow_get_info_size;
	ops->get_info = subflow_get_info;
}
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/seq_file.h>
#include <net/ip.h>
#include <net/mptcp.h>
#include <net/snmp.h>
#include <net/net_namespace.h>
#include "mib.h"
/* Name/index table for the MPTCP MIB counters.
 * mptcp_seq_show() walks it until the sentinel, printing every name on the
 * header line and the matching counter value on the following line.
 */
static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
SNMP_MIB_SENTINEL
};
/* mptcp_mib_alloc - allocate percpu mib counters
*
* These are allocated when the first mptcp socket is created so
* we do not waste percpu memory if mptcp isn't in use.
*/
bool mptcp_mib_alloc(struct net *net)
{
struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib);
if (!mib)
return false;
if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib))
free_percpu(mib);
return true;
}
/* Print the MPTCP MIB counters of the netns behind @seq as two lines:
 * an "MPTcpExt:" header line with the counter names, then an "MPTcpExt:"
 * line with the values. If the counters were never allocated, every value
 * is printed as 0.
 *
 * Both lines are always terminated by a newline, so whatever is printed
 * after this block starts on a fresh line.
 */
void mptcp_seq_show(struct seq_file *seq)
{
	struct net *net = seq->private;
	int i;

	seq_puts(seq, "MPTcpExt:");
	for (i = 0; mptcp_snmp_list[i].name; i++)
		seq_printf(seq, " %s", mptcp_snmp_list[i].name);

	seq_puts(seq, "\nMPTcpExt:");

	if (!net->mib.mptcp_statistics) {
		/* counters not allocated yet: report all zeroes */
		for (i = 0; mptcp_snmp_list[i].name; i++)
			seq_puts(seq, " 0");

		/* the value line must be terminated on this path too */
		seq_putc(seq, '\n');
		return;
	}

	for (i = 0; mptcp_snmp_list[i].name; i++)
		seq_printf(seq, " %lu",
			   snmp_fold_field(net->mib.mptcp_statistics,
					   mptcp_snmp_list[i].entry));
	seq_putc(seq, '\n');
}
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Indexes into struct mptcp_mib.mibs[], one per MPTCP counter.
 * Index 0 (MPTCP_MIB_NUM) has no entry in the exported name table.
 */
enum linux_mptcp_mib_field {
MPTCP_MIB_NUM = 0,
MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
__MPTCP_MIB_MAX
};
/* Number of slots in the counter array (enum terminator). */
#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX
/* Counter storage: one unsigned long per enum linux_mptcp_mib_field value.
 * Instances are allocated per-CPU by mptcp_mib_alloc().
 */
struct mptcp_mib {
unsigned long mibs[LINUX_MIB_MPTCP_MAX];
};
/* Bump counter @field for @net via SNMP_INC_STATS().
 * A no-op when the namespace has no MPTCP counters allocated.
 */
static inline void MPTCP_INC_STATS(struct net *net,
				   enum linux_mptcp_mib_field field)
{
	if (unlikely(!net->mib.mptcp_statistics))
		return;

	SNMP_INC_STATS(net->mib.mptcp_statistics, field);
}
/* Same as MPTCP_INC_STATS() but built on __SNMP_INC_STATS().
 * A no-op when the namespace has no MPTCP counters allocated.
 */
static inline void __MPTCP_INC_STATS(struct net *net,
				     enum linux_mptcp_mib_field field)
{
	if (unlikely(!net->mib.mptcp_statistics))
		return;

	__SNMP_INC_STATS(net->mib.mptcp_statistics, field);
}
bool mptcp_mib_alloc(struct net *net);
...@@ -96,6 +96,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, ...@@ -96,6 +96,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
mp_opt->rcvr_key, mp_opt->data_len); mp_opt->rcvr_key, mp_opt->data_len);
break; break;
case MPTCPOPT_MP_JOIN:
mp_opt->mp_join = 1;
if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
mp_opt->join_id = *ptr++;
mp_opt->token = get_unaligned_be32(ptr);
ptr += 4;
mp_opt->nonce = get_unaligned_be32(ptr);
ptr += 4;
pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
mp_opt->backup, mp_opt->join_id,
mp_opt->token, mp_opt->nonce);
} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
mp_opt->join_id = *ptr++;
mp_opt->thmac = get_unaligned_be64(ptr);
ptr += 8;
mp_opt->nonce = get_unaligned_be32(ptr);
ptr += 4;
pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
mp_opt->backup, mp_opt->join_id,
mp_opt->thmac, mp_opt->nonce);
} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
ptr += 2;
memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
pr_debug("MP_JOIN hmac");
} else {
pr_warn("MP_JOIN bad option size");
mp_opt->mp_join = 0;
}
break;
case MPTCPOPT_DSS: case MPTCPOPT_DSS:
pr_debug("DSS"); pr_debug("DSS");
ptr++; ptr++;
...@@ -178,6 +210,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, ...@@ -178,6 +210,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
break; break;
case MPTCPOPT_ADD_ADDR:
mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
if (!mp_opt->echo) {
if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
mp_opt->family = MPTCP_ADDR_IPVERSION_4;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
mp_opt->family = MPTCP_ADDR_IPVERSION_6;
#endif
else
break;
} else {
if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
mp_opt->family = MPTCP_ADDR_IPVERSION_4;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
mp_opt->family = MPTCP_ADDR_IPVERSION_6;
#endif
else
break;
}
mp_opt->add_addr = 1;
mp_opt->port = 0;
mp_opt->addr_id = *ptr++;
pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id);
if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
ptr += 4;
if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
mp_opt->port = get_unaligned_be16(ptr);
ptr += 2;
}
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else {
memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
ptr += 16;
if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
mp_opt->port = get_unaligned_be16(ptr);
ptr += 2;
}
}
#endif
if (!mp_opt->echo) {
mp_opt->ahmac = get_unaligned_be64(ptr);
ptr += 8;
}
break;
case MPTCPOPT_RM_ADDR:
if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
break;
mp_opt->rm_addr = 1;
mp_opt->rm_id = *ptr++;
pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
break;
default: default:
break; break;
} }
...@@ -231,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, ...@@ -231,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
opts->sndr_key = subflow->local_key; opts->sndr_key = subflow->local_key;
*size = TCPOLEN_MPTCP_MPC_SYN; *size = TCPOLEN_MPTCP_MPC_SYN;
return true; return true;
} else if (subflow->request_join) {
pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
subflow->local_nonce);
opts->suboptions = OPTION_MPTCP_MPJ_SYN;
opts->join_id = subflow->local_id;
opts->token = subflow->remote_token;
opts->nonce = subflow->local_nonce;
opts->backup = subflow->request_bkup;
*size = TCPOLEN_MPTCP_MPJ_SYN;
return true;
} }
return false; return false;
} }
...@@ -240,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk) ...@@ -240,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
pr_debug("subflow=%p", subflow);
if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
subflow->mp_capable = 1; subflow->mp_capable = 1;
subflow->can_ack = 1; subflow->can_ack = 1;
subflow->remote_key = tp->rx_opt.mptcp.sndr_key; subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
} else { pr_debug("subflow=%p, remote_key=%llu", subflow,
subflow->remote_key);
} else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
subflow->mp_join = 1;
subflow->thmac = tp->rx_opt.mptcp.thmac;
subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
subflow->thmac, subflow->remote_nonce);
} else if (subflow->request_mptcp) {
tcp_sk(sk)->is_mptcp = 0; tcp_sk(sk)->is_mptcp = 0;
} }
} }
/* MP_JOIN client subflow must wait for 4th ack before sending any data:
* TCP can't schedule delack timer before the subflow is fully established.
* MPTCP uses the delack timer to do 3rd ack retransmissions
*/
static void schedule_3rdack_retransmission(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned long timeout;
/* reschedule with a timeout above RTT, as we must look only for drop */
if (tp->srtt_us)
timeout = tp->srtt_us << 1;
else
timeout = TCP_TIMEOUT_INIT;
WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* Undo schedule_3rdack_retransmission(): stop the delack timer of @sk and
 * reset the delayed-ack bookkeeping that it armed.
 */
static void clear_3rdack_retransmission(struct sock *sk)
{
	struct inet_connection_sock *csk = inet_csk(sk);

	sk_stop_timer(sk, &csk->icsk_delack_timer);

	/* no ack scheduled and no timer pending any more */
	csk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
	csk->icsk_ack.ato = 0;
	csk->icsk_ack.timeout = 0;
}
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int *size,
unsigned int remaining, unsigned int remaining,
...@@ -259,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, ...@@ -259,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
struct mptcp_ext *mpext; struct mptcp_ext *mpext;
unsigned int data_len; unsigned int data_len;
pr_debug("subflow=%p fully established=%d seq=%x:%x remaining=%d", /* When skb is not available, we better over-estimate the emitted
subflow, subflow->fully_established, subflow->snd_isn, * options len. A full DSS option (28 bytes) is longer than
skb ? TCP_SKB_CB(skb)->seq : 0, remaining); * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
* tell the caller to defer the estimate to
if (subflow->mp_capable && !subflow->fully_established && skb && * mptcp_established_options_dss(), which will reserve enough space.
subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
/* When skb is not available, we better over-estimate the
* emitted options len. A full DSS option is longer than
* TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
* that.
*/ */
if (!skb)
return false;
/* MPC/MPJ needed only on 3rd ack packet */
if (subflow->fully_established ||
subflow->snd_isn != TCP_SKB_CB(skb)->seq)
return false;
if (subflow->mp_capable) {
mpext = mptcp_get_ext(skb); mpext = mptcp_get_ext(skb);
data_len = mpext ? mpext->data_len : 0; data_len = mpext ? mpext->data_len : 0;
...@@ -297,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, ...@@ -297,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
data_len); data_len);
return true; return true;
} else if (subflow->mp_join) {
opts->suboptions = OPTION_MPTCP_MPJ_ACK;
memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
*size = TCPOLEN_MPTCP_MPJ_ACK;
pr_debug("subflow=%p", subflow);
schedule_3rdack_retransmission(sk);
return true;
} }
return false; return false;
} }
...@@ -386,6 +544,83 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, ...@@ -386,6 +544,83 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return true; return true;
} }
/* Compute the truncated HMAC carried by an IPv4 ADD_ADDR option.
 *
 * The authenticated message is 7 bytes: the address id, the 4-byte IPv4
 * address and two trailing zero bytes. It is keyed with @key1/@key2 through
 * mptcp_crypto_hmac_sha(). Returns the first 8 bytes of the digest read as
 * a big-endian u64.
 */
static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
				  struct in_addr *addr)
{
	u8 digest[MPTCP_ADDR_HMAC_LEN];
	u8 payload[7] = { 0 };	/* trailing two bytes stay zero */

	payload[0] = addr_id;
	memcpy(payload + 1, &addr->s_addr, 4);

	mptcp_crypto_hmac_sha(key1, key2, payload, sizeof(payload), digest);

	return get_unaligned_be64(digest);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* Compute the truncated HMAC carried by an IPv6 ADD_ADDR option.
 *
 * The authenticated message is 19 bytes: the address id, the 16-byte IPv6
 * address and two trailing zero bytes. It is keyed with @key1/@key2 through
 * mptcp_crypto_hmac_sha(). Returns the first 8 bytes of the digest read as
 * a big-endian u64.
 */
static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
				   struct in6_addr *addr)
{
	u8 digest[MPTCP_ADDR_HMAC_LEN];
	u8 payload[19] = { 0 };	/* trailing two bytes stay zero */

	payload[0] = addr_id;
	memcpy(payload + 1, &addr->s6_addr, 16);

	mptcp_crypto_hmac_sha(key1, key2, payload, sizeof(payload), digest);

	return get_unaligned_be64(digest);
}
#endif
static bool mptcp_established_options_addr(struct sock *sk,
unsigned int *size,
unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_addr_info saddr;
int len;
if (!mptcp_pm_should_signal(msk) ||
!(mptcp_pm_addr_signal(msk, remaining, &saddr)))
return false;
len = mptcp_add_addr_len(saddr.family);
if (remaining < len)
return false;
*size = len;
opts->addr_id = saddr.id;
if (saddr.family == AF_INET) {
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
opts->addr = saddr.addr;
opts->ahmac = add_addr_generate_hmac(msk->local_key,
msk->remote_key,
opts->addr_id,
&opts->addr);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (saddr.family == AF_INET6) {
opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
opts->addr6 = saddr.addr6;
opts->ahmac = add_addr6_generate_hmac(msk->local_key,
msk->remote_key,
opts->addr_id,
&opts->addr6);
}
#endif
pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac);
return true;
}
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining, unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts) struct mptcp_out_options *opts)
...@@ -393,6 +628,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, ...@@ -393,6 +628,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int opt_size = 0; unsigned int opt_size = 0;
bool ret = false; bool ret = false;
opts->suboptions = 0;
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true; ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
...@@ -407,6 +644,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, ...@@ -407,6 +644,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
*size += opt_size; *size += opt_size;
remaining -= opt_size; remaining -= opt_size;
if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) {
*size += opt_size;
remaining -= opt_size;
ret = true;
}
return ret; return ret;
} }
...@@ -423,54 +665,194 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, ...@@ -423,54 +665,194 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
pr_debug("subflow_req=%p, local_key=%llu", pr_debug("subflow_req=%p, local_key=%llu",
subflow_req, subflow_req->local_key); subflow_req, subflow_req->local_key);
return true; return true;
} else if (subflow_req->mp_join) {
opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
opts->backup = subflow_req->backup;
opts->join_id = subflow_req->local_id;
opts->thmac = subflow_req->thmac;
opts->nonce = subflow_req->local_nonce;
pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
subflow_req, opts->backup, opts->join_id,
opts->thmac, opts->nonce);
*size = TCPOLEN_MPTCP_MPJ_SYNACK;
return true;
} }
return false; return false;
} }
static bool check_fully_established(struct mptcp_subflow_context *subflow, static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
struct mptcp_subflow_context *subflow,
struct sk_buff *skb, struct sk_buff *skb,
struct mptcp_options_received *mp_opt) struct mptcp_options_received *mp_opt)
{ {
/* here we can process OoO, in-window pkts, only in-sequence 4th ack /* here we can process OoO, in-window pkts, only in-sequence 4th ack
* are relevant * will make the subflow fully established
*/ */
if (likely(subflow->fully_established || if (likely(subflow->fully_established)) {
TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) /* on passive sockets, check for 3rd ack retransmission
return true; * note that msk is always set by subflow_syn_recv_sock()
* for mp_join subflows
*/
if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
subflow->mp_join && mp_opt->mp_join &&
READ_ONCE(msk->pm.server_side))
tcp_send_ack(sk);
goto fully_established;
}
if (mp_opt->use_ack) /* we should process OoO packets before the first subflow is fully
* established, but not expected for MP_JOIN subflows
*/
if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
return subflow->mp_capable;
if (mp_opt->use_ack) {
/* subflows are fully established as soon as we get any
* additional ack.
*/
subflow->fully_established = 1; subflow->fully_established = 1;
goto fully_established;
}
if (subflow->can_ack) WARN_ON_ONCE(subflow->can_ack);
return true;
/* If the first established packet does not contain MP_CAPABLE + data /* If the first established packet does not contain MP_CAPABLE + data
* then fallback to TCP * then fallback to TCP
*/ */
if (!mp_opt->mp_capable) { if (!mp_opt->mp_capable) {
subflow->mp_capable = 0; subflow->mp_capable = 0;
tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; tcp_sk(sk)->is_mptcp = 0;
return false; return false;
} }
subflow->fully_established = 1;
subflow->remote_key = mp_opt->sndr_key; subflow->remote_key = mp_opt->sndr_key;
subflow->can_ack = 1; subflow->can_ack = 1;
fully_established:
if (likely(subflow->pm_notified))
return true;
subflow->pm_notified = 1;
if (subflow->mp_join) {
clear_3rdack_retransmission(sk);
mptcp_pm_subflow_established(msk, subflow);
} else {
mptcp_pm_fully_established(msk);
}
return true;
}
/* Rebuild a 64-bit data ack from the value carried by the option.
 *
 * A 64-bit ack (@use_64bit) is returned untouched.  A 32-bit ack inherits
 * the upper half of @old_ack; when its lower half compares before() the
 * lower half of @old_ack the result is moved forward by 2^32.
 */
static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
{
	u32 prev_low, new_low;
	u64 expanded;

	if (use_64bit)
		return cur_ack;

	prev_low = (u32)old_ack;
	new_low = (u32)cur_ack;

	expanded = (old_ack & GENMASK_ULL(63, 32)) + new_low;
	if (unlikely(before(new_low, prev_low)))
		expanded += 1LL << 32;

	return expanded;
}
/* Advance msk->snd_una according to the DATA_ACK found in @mp_opt.
 *
 * The new value is installed with a compare-and-exchange loop, and
 * mptcp_data_acked() is invoked only by the caller whose exchange
 * succeeded.  Acks beyond msk->write_seq are ignored.
 */
static void update_una(struct mptcp_sock *msk,
		       struct mptcp_options_received *mp_opt)
{
	u64 new_snd_una, expected, old_snd_una = atomic64_read(&msk->snd_una);
	u64 write_seq = READ_ONCE(msk->write_seq);

	/* avoid ack expansion on update conflict, to reduce the risk of
	 * wrongly expanding to a future ack sequence number, which is way
	 * more dangerous than missing an ack
	 */
	new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);

	/* ACK for data not even sent yet? Ignore. */
	if (after64(new_snd_una, write_seq))
		return;

	for (;;) {
		/* nothing to do unless the ack moves snd_una forward */
		if (!after64(new_snd_una, old_snd_una))
			return;

		expected = old_snd_una;
		old_snd_una = atomic64_cmpxchg(&msk->snd_una, expected,
					       new_snd_una);
		if (old_snd_una == expected)
			break;
	}

	/* we won the update: let the msk release the acked data */
	mptcp_data_acked((struct sock *)msk);
}
static bool add_addr_hmac_valid(struct mptcp_sock *msk,
struct mptcp_options_received *mp_opt)
{
u64 hmac = 0;
if (mp_opt->echo)
return true; return true;
if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
hmac = add_addr_generate_hmac(msk->remote_key,
msk->local_key,
mp_opt->addr_id, &mp_opt->addr);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else
hmac = add_addr6_generate_hmac(msk->remote_key,
msk->local_key,
mp_opt->addr_id, &mp_opt->addr6);
#endif
pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
msk, (unsigned long long)hmac,
(unsigned long long)mp_opt->ahmac);
return hmac == mp_opt->ahmac;
} }
void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct tcp_options_received *opt_rx) struct tcp_options_received *opt_rx)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_options_received *mp_opt; struct mptcp_options_received *mp_opt;
struct mptcp_ext *mpext; struct mptcp_ext *mpext;
mp_opt = &opt_rx->mptcp; mp_opt = &opt_rx->mptcp;
if (!check_fully_established(subflow, skb, mp_opt)) if (!check_fully_established(msk, sk, subflow, skb, mp_opt))
return; return;
if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) {
struct mptcp_addr_info addr;
addr.port = htons(mp_opt->port);
addr.id = mp_opt->addr_id;
if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
addr.family = AF_INET;
addr.addr = mp_opt->addr;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) {
addr.family = AF_INET6;
addr.addr6 = mp_opt->addr6;
}
#endif
if (!mp_opt->echo)
mptcp_pm_add_addr_received(msk, &addr);
mp_opt->add_addr = 0;
}
if (!mp_opt->dss) if (!mp_opt->dss)
return; return;
/* we can't wait for recvmsg() to update the ack_seq, otherwise
* unidirectional flows will get stuck
*/
if (mp_opt->use_ack)
update_una(msk, mp_opt);
mpext = skb_ext_add(skb, SKB_EXT_MPTCP); mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext) if (!mpext)
return; return;
...@@ -497,12 +879,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, ...@@ -497,12 +879,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
mpext->use_map = 1; mpext->use_map = 1;
} }
if (mp_opt->use_ack) {
mpext->data_ack = mp_opt->data_ack;
mpext->use_ack = 1;
mpext->ack64 = mp_opt->ack64;
}
mpext->data_fin = mp_opt->data_fin; mpext->data_fin = mp_opt->data_fin;
} }
...@@ -521,9 +897,8 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) ...@@ -521,9 +897,8 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
else else
len = TCPOLEN_MPTCP_MPC_ACK; len = TCPOLEN_MPTCP_MPC_ACK;
*ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
(MPTCPOPT_MP_CAPABLE << 12) | MPTCP_SUPPORTED_VERSION,
(MPTCP_SUPPORTED_VERSION << 8) |
MPTCP_CAP_HMAC_SHA256); MPTCP_CAP_HMAC_SHA256);
if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
...@@ -546,6 +921,77 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) ...@@ -546,6 +921,77 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
} }
mp_capable_done: mp_capable_done:
if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
if (opts->ahmac)
*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
TCPOLEN_MPTCP_ADD_ADDR, 0,
opts->addr_id);
else
*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
TCPOLEN_MPTCP_ADD_ADDR_BASE,
MPTCP_ADDR_ECHO,
opts->addr_id);
memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
ptr += 1;
if (opts->ahmac) {
put_unaligned_be64(opts->ahmac, ptr);
ptr += 2;
}
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
if (opts->ahmac)
*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
TCPOLEN_MPTCP_ADD_ADDR6, 0,
opts->addr_id);
else
*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
TCPOLEN_MPTCP_ADD_ADDR6_BASE,
MPTCP_ADDR_ECHO,
opts->addr_id);
memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
ptr += 4;
if (opts->ahmac) {
put_unaligned_be64(opts->ahmac, ptr);
ptr += 2;
}
}
#endif
if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
TCPOLEN_MPTCP_RM_ADDR_BASE,
0, opts->rm_id);
}
if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_SYN,
opts->backup, opts->join_id);
put_unaligned_be32(opts->token, ptr);
ptr += 1;
put_unaligned_be32(opts->nonce, ptr);
ptr += 1;
}
if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_SYNACK,
opts->backup, opts->join_id);
put_unaligned_be64(opts->thmac, ptr);
ptr += 2;
put_unaligned_be32(opts->nonce, ptr);
ptr += 1;
}
if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
ptr += 5;
}
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy; struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE; u8 len = TCPOLEN_MPTCP_DSS_BASE;
...@@ -567,10 +1013,7 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) ...@@ -567,10 +1013,7 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
flags |= MPTCP_DSS_DATA_FIN; flags |= MPTCP_DSS_DATA_FIN;
} }
*ptr++ = htonl((TCPOPT_MPTCP << 24) | *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
(len << 16) |
(MPTCPOPT_DSS << 12) |
(flags));
if (mpext->use_ack) { if (mpext->use_ack) {
put_unaligned_be64(mpext->data_ack, ptr); put_unaligned_be64(mpext->data_ack, ptr);
......
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
*
* Copyright (c) 2019, Intel Corporation.
*/
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
/* workqueue running the per-msk PM work items; allocated by mptcp_pm_init() */
static struct workqueue_struct *pm_wq;
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
pr_debug("msk=%p, local_id=%d", msk, addr->id);
msk->pm.local = *addr;
WRITE_ONCE(msk->pm.addr_signal, true);
return 0;
}
/* Address removal is not implemented: always fails with -ENOTSUPP. */
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
{
	return -ENOTSUPP;
}
/* Subflow removal is not implemented: always fails with -ENOTSUPP. */
int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id)
{
	return -ENOTSUPP;
}
/* path manager event handlers */
/* New connection event: record in the PM data whether this end is the
 * server side.
 */
void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
{
	pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);

	WRITE_ONCE(msk->pm.server_side, server_side);
}
/* Account for one more subflow if the limit allows it.
 *
 * Returns true, with pm->subflows incremented, when the subflow count is
 * still below pm->subflows_max.  Reaching the limit clears accept_subflow
 * so later callers bail out before taking the lock.
 */
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;
	int allowed;

	pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
		 pm->subflows_max, READ_ONCE(pm->accept_subflow));

	/* lockless fast path: nothing can be accepted */
	if (!READ_ONCE(pm->accept_subflow))
		return false;

	spin_lock_bh(&pm->lock);
	allowed = pm->subflows < pm->subflows_max;
	if (allowed) {
		pm->subflows++;
		if (pm->subflows == pm->subflows_max)
			WRITE_ONCE(pm->accept_subflow, false);
	}
	spin_unlock_bh(&pm->lock);

	return allowed;
}
/* return true if the new status bit is currently cleared, that is, this event
* can be server, eventually by an already scheduled work
*/
static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
enum mptcp_pm_status new_status)
{
pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
BIT(new_status));
if (msk->pm.status & BIT(new_status))
return false;
msk->pm.status |= BIT(new_status);
if (queue_work(pm_wq, &msk->pm.work))
sock_hold((struct sock *)msk);
return true;
}
/* Connection fully established event: schedule the PM worker with
 * MPTCP_PM_ESTABLISHED if the PM still has work pending.
 */
void mptcp_pm_fully_established(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p", msk);

	/* lockless check first, to avoid acquiring the lock when idle */
	if (READ_ONCE(pm->work_pending)) {
		spin_lock_bh(&pm->lock);

		/* re-check now that the lock is held */
		if (READ_ONCE(pm->work_pending))
			mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);

		spin_unlock_bh(&pm->lock);
	}
}
/* Connection closed event: currently only emits a debug trace. */
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
	pr_debug("msk=%p", msk);
}
/* Subflow established event: schedule the PM worker with
 * MPTCP_PM_SUBFLOW_ESTABLISHED if the PM still has work pending.
 * @subflow is not used here.
 */
void mptcp_pm_subflow_established(struct mptcp_sock *msk,
				  struct mptcp_subflow_context *subflow)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p", msk);

	/* lockless check first, to avoid acquiring the lock when idle */
	if (READ_ONCE(pm->work_pending)) {
		spin_lock_bh(&pm->lock);

		/* re-check now that the lock is held */
		if (READ_ONCE(pm->work_pending))
			mptcp_pm_schedule_work(msk,
					       MPTCP_PM_SUBFLOW_ESTABLISHED);

		spin_unlock_bh(&pm->lock);
	}
}
/* Subflow closed event: currently only emits a debug trace; @id is unused. */
void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
{
	pr_debug("msk=%p", msk);
}
/* ADD_ADDR received event: if the PM still accepts remote addresses and
 * no MPTCP_PM_ADD_ADDR_RECEIVED work is already pending, schedule it and
 * store @addr in pm->remote for the worker.
 */
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
				const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
		 READ_ONCE(pm->accept_addr));

	/* avoid acquiring the lock if there is no room for further addresses */
	if (!READ_ONCE(pm->accept_addr))
		return;

	spin_lock_bh(&pm->lock);

	/* be sure there is something to signal, re-checking under PM lock */
	if (READ_ONCE(pm->accept_addr)) {
		if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
			pm->remote = *addr;
	}

	spin_unlock_bh(&pm->lock);
}
/* path manager helpers */
bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_addr_info *saddr)
{
int ret = false;
spin_lock_bh(&msk->pm.lock);
/* double check after the lock is acquired */
if (!mptcp_pm_should_signal(msk))
goto out_unlock;
if (remaining < mptcp_add_addr_len(msk->pm.local.family))
goto out_unlock;
*saddr = msk->pm.local;
WRITE_ONCE(msk->pm.addr_signal, false);
ret = true;
out_unlock:
spin_unlock_bh(&msk->pm.lock);
return ret;
}
/* Resolve the local address ID for @skc; delegates to the netlink PM. */
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
	return mptcp_pm_nl_get_local_id(msk, skc);
}
/* PM work handler.
 *
 * Runs with the msk socket lock and the PM spinlock held.  Each pending
 * status bit is cleared and then dispatched to the matching netlink PM
 * handler, in the fixed order ADD_ADDR_RECEIVED, ESTABLISHED,
 * SUBFLOW_ESTABLISHED.  The final sock_put() releases the reference taken
 * when the work was queued.
 */
static void pm_worker(struct work_struct *work)
{
	struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data,
						work);
	struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm);
	struct sock *sk = (struct sock *)msk;

	lock_sock(sk);
	spin_lock_bh(&pm->lock);

	pr_debug("msk=%p status=%x", msk, pm->status);

	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
		mptcp_pm_nl_add_addr_received(msk);
	}

	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
		mptcp_pm_nl_fully_established(msk);
	}

	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
		mptcp_pm_nl_subflow_established(msk);
	}

	spin_unlock_bh(&pm->lock);
	release_sock(sk);

	sock_put(sk);
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
msk->pm.add_addr_accepted = 0;
msk->pm.local_addr_used = 0;
msk->pm.subflows = 0;
WRITE_ONCE(msk->pm.work_pending, false);
WRITE_ONCE(msk->pm.addr_signal, false);
WRITE_ONCE(msk->pm.accept_addr, false);
WRITE_ONCE(msk->pm.accept_subflow, false);
msk->pm.status = 0;
spin_lock_init(&msk->pm.lock);
INIT_WORK(&msk->pm.work, pm_worker);
mptcp_pm_nl_data_init(msk);
}
void mptcp_pm_close(struct mptcp_sock *msk)
{
if (cancel_work_sync(&msk->pm.work))
sock_put((struct sock *)msk);
}
void mptcp_pm_init(void)
{
pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
if (!pm_wq)
panic("Failed to allocate workqueue");
mptcp_pm_nl_init();
}
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
*
* Copyright (c) 2020, Red Hat, Inc.
*/
#include <linux/inet.h>
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
#include <net/mptcp.h>
#include <net/genetlink.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
/* forward declaration: the family is defined after the command handlers
 * that reference it
 */
static struct genl_family mptcp_genl_family;
/* key passed to net_generic() to fetch the per-netns struct pm_nl_pernet */
static int pm_nl_pernet_id;
/* One local address known to the netlink PM, linked into the per-netns
 * local_addr_list.
 */
struct mptcp_pm_addr_entry {
	/* node in pm_nl_pernet::local_addr_list, traversed under RCU */
	struct list_head list;
	/* MPTCP_PM_ADDR_FLAG_* bits (SIGNAL and/or SUBFLOW) */
	unsigned int flags;
	/* interface index handed to __mptcp_subflow_connect() */
	int ifindex;
	/* the address itself, including its ID */
	struct mptcp_addr_info addr;
	/* used by kfree_rcu() when the entry is removed */
	struct rcu_head rcu;
};
/* Per network namespace state of the netlink path manager. */
struct pm_nl_pernet {
	/* protects pernet updates */
	spinlock_t lock;
	/* list of struct mptcp_pm_addr_entry, readers use RCU */
	struct list_head local_addr_list;
	/* number of entries in local_addr_list, capped at MPTCP_PM_ADDR_MAX */
	unsigned int addrs;
	/* number of listed entries carrying MPTCP_PM_ADDR_FLAG_SIGNAL */
	unsigned int add_addr_signal_max;
	/* limit configured through MPTCP_PM_ATTR_RCV_ADD_ADDRS */
	unsigned int add_addr_accept_max;
	/* number of listed entries carrying MPTCP_PM_ADDR_FLAG_SUBFLOW */
	unsigned int local_addr_max;
	/* limit configured through MPTCP_PM_ATTR_SUBFLOWS */
	unsigned int subflows_max;
	/* ID assigned to the next appended address; never reused */
	unsigned int next_id;
};
/* upper bound for both the local address list length and the values
 * accepted by parse_limit()
 */
#define MPTCP_PM_ADDR_MAX 8
/* Compare two addresses: family and IP must match, and the port too
 * when @use_port is set.
 */
static bool addresses_equal(const struct mptcp_addr_info *a,
			    struct mptcp_addr_info *b, bool use_port)
{
	bool same_ip = false;

	if (a->family != b->family)
		return false;

	if (a->family == AF_INET)
		same_ip = a->addr.s_addr == b->addr.s_addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else
		same_ip = !ipv6_addr_cmp(&a->addr6, &b->addr6);
#endif

	return same_ip && (!use_port || a->port == b->port);
}
/* Fill @addr with the family and local (receive) address of @skc; the
 * port is set to 0.  The id field is left untouched.
 */
static void local_address(const struct sock_common *skc,
			  struct mptcp_addr_info *addr)
{
	addr->family = skc->skc_family;
	addr->port = 0;

	switch (addr->family) {
	case AF_INET:
		addr->addr.s_addr = skc->skc_rcv_saddr;
		break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	case AF_INET6:
		addr->addr6 = skc->skc_v6_rcv_saddr;
		break;
#endif
	default:
		break;
	}
}
/* Fill @addr with the family, destination port and destination address
 * of @skc.  The id field is left untouched.
 */
static void remote_address(const struct sock_common *skc,
			   struct mptcp_addr_info *addr)
{
	addr->family = skc->skc_family;
	addr->port = skc->skc_dport;

	switch (addr->family) {
	case AF_INET:
		addr->addr.s_addr = skc->skc_daddr;
		break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	case AF_INET6:
		addr->addr6 = skc->skc_v6_daddr;
		break;
#endif
	default:
		break;
	}
}
/* Return true if any subflow on @list is bound to the local address
 * @saddr; ports are ignored.
 */
static bool lookup_subflow_by_saddr(const struct list_head *list,
				    struct mptcp_addr_info *saddr)
{
	struct mptcp_subflow_context *subflow;

	list_for_each_entry(subflow, list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct mptcp_addr_info cur;

		local_address((struct sock_common *)ssk, &cur);
		if (addresses_equal(&cur, saddr, false))
			return true;
	}

	return false;
}
/* Pick the first listed address flagged MPTCP_PM_ADDR_FLAG_SUBFLOW that
 * matches the msk family and is not already used by an existing subflow
 * or a pending join.  Returns NULL when no such address exists.
 *
 * NOTE(review): the entry is returned after rcu_read_unlock(); confirm
 * that callers dereferencing it are otherwise protected against removal.
 */
static struct mptcp_pm_addr_entry *
select_local_address(const struct pm_nl_pernet *pernet,
		     struct mptcp_sock *msk)
{
	struct mptcp_pm_addr_entry *entry, *ret = NULL;
	struct sock *sk = (struct sock *)msk;

	rcu_read_lock();
	spin_lock_bh(&msk->join_list_lock);
	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
			continue;

		if (entry->addr.family != sk->sk_family)
			continue;

		/* avoid any address already in use by subflows and
		 * pending join
		 */
		if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
			continue;
		if (lookup_subflow_by_saddr(&msk->join_list, &entry->addr))
			continue;

		ret = entry;
		break;
	}
	spin_unlock_bh(&msk->join_list_lock);
	rcu_read_unlock();

	return ret;
}
/* Return the @pos-th (0-based) listed address flagged
 * MPTCP_PM_ADDR_FLAG_SIGNAL, or NULL if there are not that many.
 *
 * NOTE(review): the entry is returned after rcu_read_unlock(); confirm
 * that callers dereferencing it are otherwise protected against removal.
 */
static struct mptcp_pm_addr_entry *
select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
{
	struct mptcp_pm_addr_entry *entry, *ret = NULL;
	int i = 0;

	rcu_read_lock();
	/* do not keep any additional per socket state, just signal
	 * the address list in order.
	 * Note: removal from the local address list during the msk life-cycle
	 * can lead to additional addresses not being announced.
	 */
	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
			continue;

		if (i == pos) {
			ret = entry;
			break;
		}
		i++;
	}
	rcu_read_unlock();

	return ret;
}
static void check_work_pending(struct mptcp_sock *msk)
{
if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max &&
(msk->pm.local_addr_used == msk->pm.local_addr_max ||
msk->pm.subflows == msk->pm.subflows_max))
WRITE_ONCE(msk->pm.work_pending, false);
}
/* Common handler for the "fully established" and "subflow established"
 * events.  Called with msk->pm.lock held; the lock is dropped and
 * re-acquired around __mptcp_subflow_connect().
 *
 * First try to announce the next not-yet-signaled local address, then
 * try to open one additional subflow from an unused local address
 * towards the msk remote address.  Whenever a pick fails, the related
 * counter is saturated so that no further attempt is made for it.
 *
 * NOTE(review): the entries returned by select_signal_address() and
 * select_local_address() are dereferenced outside the RCU read section;
 * confirm their lifetime against concurrent address removal.
 */
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	struct mptcp_pm_addr_entry *local;
	struct mptcp_addr_info remote;
	struct pm_nl_pernet *pernet;

	pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);

	pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
		 msk->pm.local_addr_used, msk->pm.local_addr_max,
		 msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max,
		 msk->pm.subflows, msk->pm.subflows_max);

	/* check first for announce */
	if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) {
		local = select_signal_address(pernet,
					      msk->pm.add_addr_signaled);

		if (local) {
			msk->pm.add_addr_signaled++;
			mptcp_pm_announce_addr(msk, &local->addr);
		} else {
			/* pick failed, avoid further attempts later: saturate
			 * the signal counter, leaving the subflow accounting
			 * (local_addr_used) untouched
			 */
			msk->pm.add_addr_signaled =
				msk->pm.add_addr_signal_max;
		}

		check_work_pending(msk);
	}

	/* check if should create a new subflow */
	if (msk->pm.local_addr_used < msk->pm.local_addr_max &&
	    msk->pm.subflows < msk->pm.subflows_max) {
		remote_address((struct sock_common *)sk, &remote);

		local = select_local_address(pernet, msk);
		if (local) {
			msk->pm.local_addr_used++;
			msk->pm.subflows++;
			check_work_pending(msk);
			/* the connect is performed without the PM lock */
			spin_unlock_bh(&msk->pm.lock);
			__mptcp_subflow_connect(sk, local->ifindex,
						&local->addr, &remote);
			spin_lock_bh(&msk->pm.lock);
			return;
		}

		/* lookup failed, avoid further attempts later */
		msk->pm.local_addr_used = msk->pm.local_addr_max;
		check_work_pending(msk);
	}
}
/* Worker-side handler for MPTCP_PM_ESTABLISHED: announce an address
 * and/or create a subflow, as the limits allow.
 */
void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
{
	mptcp_pm_create_subflow_or_signal_addr(msk);
}
/* Worker-side handler for MPTCP_PM_SUBFLOW_ESTABLISHED: announce an
 * address and/or create a subflow, as the limits allow.
 */
void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
{
	mptcp_pm_create_subflow_or_signal_addr(msk);
}
void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info remote;
struct mptcp_addr_info local;
pr_debug("accepted %d:%d remote family %d",
msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max,
msk->pm.remote.family);
msk->pm.add_addr_accepted++;
msk->pm.subflows++;
if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max ||
msk->pm.subflows >= msk->pm.subflows_max)
WRITE_ONCE(msk->pm.accept_addr, false);
/* connect to the specified remote address, using whatever
* local address the routing configuration will pick.
*/
remote = msk->pm.remote;
if (!remote.port)
remote.port = sk->sk_dport;
memset(&local, 0, sizeof(local));
local.family = remote.family;
spin_unlock_bh(&msk->pm.lock);
__mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
spin_lock_bh(&msk->pm.lock);
}
/* True only for entries flagged SIGNAL and not SUBFLOW. */
static bool address_use_port(struct mptcp_pm_addr_entry *entry)
{
	const unsigned int mask = MPTCP_PM_ADDR_FLAG_SIGNAL |
				  MPTCP_PM_ADDR_FLAG_SUBFLOW;

	return (entry->flags & mask) == MPTCP_PM_ADDR_FLAG_SIGNAL;
}
/* Append @entry to the per-netns local address list.
 *
 * On success the entry gets the next free ID, the per-netns counters are
 * updated according to its flags, and the assigned ID is returned.
 * Returns -EINVAL, leaving @entry unlinked, when the ID space or the
 * list capacity is exhausted or when the address is already listed.
 */
static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
					     struct mptcp_pm_addr_entry *entry)
{
	struct mptcp_pm_addr_entry *cur;
	int ret = -EINVAL;

	spin_lock_bh(&pernet->lock);

	/* to keep the code simple, don't do IDR-like allocation for address ID,
	 * just bail when we exceed limits
	 */
	if (pernet->next_id > 255 || pernet->addrs >= MPTCP_PM_ADDR_MAX)
		goto out;

	/* do not insert duplicate address, differentiate on port only
	 * signal-only addresses
	 */
	list_for_each_entry(cur, &pernet->local_addr_list, list) {
		bool use_port = address_use_port(entry) &&
				address_use_port(cur);

		if (addresses_equal(&cur->addr, &entry->addr, use_port))
			goto out;
	}

	if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
		pernet->add_addr_signal_max++;
	if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
		pernet->local_addr_max++;

	entry->addr.id = pernet->next_id++;
	pernet->addrs++;
	list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
	ret = entry->addr.id;

out:
	spin_unlock_bh(&pernet->lock);
	return ret;
}
/* Map the local address of @skc to an MPTCP address ID.
 *
 * Returns 0 when the address matches the msk local address, the ID of the
 * matching entry when the address is already in the per-netns list, or
 * the ID of a freshly appended entry otherwise.  Returns -1 if @msk is
 * NULL, -ENOMEM on allocation failure, or the negative error reported by
 * mptcp_pm_nl_append_new_local_addr().
 */
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
	struct mptcp_pm_addr_entry *entry;
	struct mptcp_addr_info skc_local;
	struct mptcp_addr_info msk_local;
	struct pm_nl_pernet *pernet;
	int ret = -1;

	if (WARN_ON_ONCE(!msk))
		return -1;

	/* The 0 ID mapping is defined by the first subflow, copied into the msk
	 * addr
	 */
	local_address((struct sock_common *)msk, &msk_local);
	/* the address to look up is the one of @skc, not the msk one */
	local_address(skc, &skc_local);
	if (addresses_equal(&msk_local, &skc_local, false))
		return 0;

	pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);

	rcu_read_lock();
	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
		if (addresses_equal(&entry->addr, &skc_local, false)) {
			ret = entry->addr.id;
			break;
		}
	}
	rcu_read_unlock();
	if (ret >= 0)
		return ret;

	/* address not found, add to local list */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return -ENOMEM;

	/* no flags: the entry is neither signaled nor used for new subflows;
	 * ifindex must be initialised as well, since it is reported by
	 * mptcp_nl_fill_addr()
	 */
	entry->flags = 0;
	entry->ifindex = 0;
	entry->addr = skc_local;
	ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
	if (ret < 0)
		kfree(entry);

	return ret;
}
/* Snapshot the per-netns limits into the msk PM data and derive the
 * initial work_pending, accept_addr and accept_subflow flags from them.
 */
void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;
	struct pm_nl_pernet *pernet;
	bool has_subflows;

	pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);

	pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max);
	pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max);
	pm->local_addr_max = READ_ONCE(pernet->local_addr_max);
	pm->subflows_max = READ_ONCE(pernet->subflows_max);

	has_subflows = pm->subflows_max != 0;

	/* work is pending if there is something to announce, or a local
	 * address available while subflows are allowed
	 */
	WRITE_ONCE(pm->work_pending,
		   (pm->local_addr_max && has_subflows) ||
		   pm->add_addr_signal_max);
	WRITE_ONCE(pm->accept_addr, pm->add_addr_accept_max && has_subflows);
	WRITE_ONCE(pm->accept_subflow, has_subflows);
}
/* index of the command multicast group inside mptcp_pm_mcgrps[] */
#define MPTCP_PM_CMD_GRP_OFFSET 0
/* generic netlink multicast groups exposed by the MPTCP PM family */
static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
	[MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
};
/* Validation policy for the attributes nested inside MPTCP_PM_ATTR_ADDR.
 * The IPv6 address must be exactly sizeof(struct in6_addr) bytes long.
 */
static const struct nla_policy
mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
	[MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
	[MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
	[MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
	[MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN,
	.len = sizeof(struct in6_addr), },
	[MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
	[MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
	[MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
};
/* Top-level validation policy of the MPTCP PM family: a nested address
 * description plus the two u32 limits.
 */
static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
	[MPTCP_PM_ATTR_ADDR] =
	NLA_POLICY_NESTED(mptcp_pm_addr_policy),
	[MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
	[MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
};
/* Return the address attribute type carrying an address of @family:
 * ADDR6 for AF_INET6 when IPv6 support is built in, ADDR4 otherwise.
 */
static int mptcp_pm_family_to_addr(int family)
{
	int attr_type = MPTCP_PM_ADDR_ATTR_ADDR4;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (family == AF_INET6)
		attr_type = MPTCP_PM_ADDR_ATTR_ADDR6;
#endif
	return attr_type;
}
/* Parse the nested address attribute @attr into @entry.
 *
 * @entry is zeroed first, so every attribute missing from the message
 * leaves the matching field at 0.  When the family attribute is present
 * it must be a supported one and the matching address attribute must be
 * present too.  When it is absent, parsing fails only if @require_family
 * is set; otherwise the address is skipped and only ifindex, id and flags
 * are read.  MPTCP_PM_ADDR_ATTR_PORT is not read by this function.
 *
 * Returns 0 on success or a negative error, with an extack message set.
 */
static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
			       bool require_family,
			       struct mptcp_pm_addr_entry *entry)
{
	struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
	int err, addr_addr;

	if (!attr) {
		GENL_SET_ERR_MSG(info, "missing address info");
		return -EINVAL;
	}

	/* no validation needed - was already done via nested policy */
	err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
					  mptcp_pm_addr_policy, info->extack);
	if (err)
		return err;

	memset(entry, 0, sizeof(*entry));
	if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
		/* lookups by ID do not need the address */
		if (!require_family)
			goto skip_family;

		NL_SET_ERR_MSG_ATTR(info->extack, attr,
				    "missing family");
		return -EINVAL;
	}

	entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
	if (entry->addr.family != AF_INET
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	    && entry->addr.family != AF_INET6
#endif
	    ) {
		NL_SET_ERR_MSG_ATTR(info->extack, attr,
				    "unknown address family");
		return -EINVAL;
	}
	/* pick the attribute type matching the family */
	addr_addr = mptcp_pm_family_to_addr(entry->addr.family);
	if (!tb[addr_addr]) {
		NL_SET_ERR_MSG_ATTR(info->extack, attr,
				    "missing address data");
		return -EINVAL;
	}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (entry->addr.family == AF_INET6)
		entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]);
	else
#endif
		entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);

skip_family:
	/* optional attributes */
	if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
		entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);

	if (tb[MPTCP_PM_ADDR_ATTR_ID])
		entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);

	if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
		entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);

	return 0;
}
/* Return the per-netns PM state for the namespace the request came from. */
static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
{
	struct net *net = genl_info_net(info);

	return net_generic(net, pm_nl_pernet_id);
}
static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
int ret;
ret = mptcp_pm_parse_addr(attr, info, true, &addr);
if (ret < 0)
return ret;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
return -ENOMEM;
}
*entry = addr;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0) {
GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
kfree(entry);
return ret;
}
return 0;
}
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, &pernet->local_addr_list, list) {
if (entry->addr.id == id)
return entry;
}
return NULL;
}
static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
int ret;
ret = mptcp_pm_parse_addr(attr, info, false, &addr);
if (ret < 0)
return ret;
spin_lock_bh(&pernet->lock);
entry = __lookup_addr_by_id(pernet, addr.addr.id);
if (!entry) {
GENL_SET_ERR_MSG(info, "address not found");
ret = -EINVAL;
goto out;
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max--;
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max--;
pernet->addrs--;
list_del_rcu(&entry->list);
kfree_rcu(entry, rcu);
out:
spin_unlock_bh(&pernet->lock);
return ret;
}
/* Unlink every entry from the local address list and free each one after
 * an RCU grace period.  The counters are not touched here.
 */
static void __flush_addrs(struct pm_nl_pernet *pernet)
{
	struct list_head *head = &pernet->local_addr_list;
	struct mptcp_pm_addr_entry *cur;

	while (!list_empty(head)) {
		cur = list_first_entry(head, struct mptcp_pm_addr_entry, list);
		list_del_rcu(&cur->list);
		kfree_rcu(cur, rcu);
	}
}
/* Zero the per-netns address counters and the accept limit.
 * subflows_max and next_id are left unchanged.
 */
static void __reset_counters(struct pm_nl_pernet *pernet)
{
	pernet->addrs = 0;
	pernet->local_addr_max = 0;
	pernet->add_addr_accept_max = 0;
	pernet->add_addr_signal_max = 0;
}
/* MPTCP_PM_CMD_FLUSH_ADDRS handler: drop every local address and reset
 * the related counters under the pernet lock.  Always returns 0.
 */
static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
{
	struct pm_nl_pernet *pernet = genl_info_pm_nl(info);

	spin_lock_bh(&pernet->lock);

	__flush_addrs(pernet);
	__reset_counters(pernet);

	spin_unlock_bh(&pernet->lock);

	return 0;
}
/* Dump @entry into @skb as a nested MPTCP_PM_ATTR_ADDR attribute.
 *
 * The nest carries family, ID, flags, the interface index (only when
 * non-zero) and the IPv4 or IPv6 address.  Returns 0 on success; on any
 * failure the partially built nest is cancelled and -EMSGSIZE is
 * returned.
 */
static int mptcp_nl_fill_addr(struct sk_buff *skb,
			      struct mptcp_pm_addr_entry *entry)
{
	struct mptcp_addr_info *addr = &entry->addr;
	struct nlattr *attr;

	attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR);
	if (!attr)
		return -EMSGSIZE;

	if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
		goto nla_put_failure;
	if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
		goto nla_put_failure;
	if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
		goto nla_put_failure;
	if (entry->ifindex &&
	    nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
		goto nla_put_failure;

	/* the address can overflow the skb like any other attribute: do not
	 * report success for a nest missing it
	 */
	if (addr->family == AF_INET &&
	    nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
			    addr->addr.s_addr))
		goto nla_put_failure;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->family == AF_INET6 &&
		 nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6))
		goto nla_put_failure;
#endif
	nla_nest_end(skb, attr);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, attr);
	return -EMSGSIZE;
}
/* MPTCP_PM_CMD_GET_ADDR "doit" handler: look up a local address by ID and
 * send it back to the requester.
 *
 * Returns the genlmsg_reply() result on success, or a negative error
 * (-ENOMEM, -EMSGSIZE, -EINVAL when the ID is unknown); on failure the
 * reply skb is freed here.
 */
static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
	struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
	struct mptcp_pm_addr_entry addr, *entry;
	struct sk_buff *msg;
	void *reply;
	int ret;

	/* only the ID is used below, the family is optional */
	ret = mptcp_pm_parse_addr(attr, info, false, &addr);
	if (ret < 0)
		return ret;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
				  info->genlhdr->cmd);
	if (!reply) {
		GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
		ret = -EMSGSIZE;
		goto fail;
	}

	/* the entry is looked up and dumped under the pernet lock */
	spin_lock_bh(&pernet->lock);
	entry = __lookup_addr_by_id(pernet, addr.addr.id);
	if (!entry) {
		GENL_SET_ERR_MSG(info, "address not found");
		ret = -EINVAL;
		goto unlock_fail;
	}

	ret = mptcp_nl_fill_addr(msg, entry);
	if (ret)
		goto unlock_fail;

	genlmsg_end(msg, reply);
	/* note: the reply is sent while pernet->lock is still held */
	ret = genlmsg_reply(msg, info);
	spin_unlock_bh(&pernet->lock);
	return ret;

unlock_fail:
	spin_unlock_bh(&pernet->lock);

fail:
	nlmsg_free(msg);
	return ret;
}
/* MPTCP_PM_CMD_GET_ADDR "dumpit" handler: dump the local address list.
 *
 * cb->args[0] holds the ID of the last entry dumped so far; entries whose
 * ID is not greater than it are skipped, so a dump spanning several skbs
 * resumes where it stopped.  Returns msg->len.
 */
static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
				   struct netlink_callback *cb)
{
	struct net *net = sock_net(msg->sk);
	struct mptcp_pm_addr_entry *entry;
	struct pm_nl_pernet *pernet;
	int id = cb->args[0];
	void *hdr;

	pernet = net_generic(net, pm_nl_pernet_id);

	spin_lock_bh(&pernet->lock);
	list_for_each_entry(entry, &pernet->local_addr_list, list) {
		/* already dumped by a previous invocation */
		if (entry->addr.id <= id)
			continue;

		hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, &mptcp_genl_family,
				  NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
		/* no room for another message header: stop here */
		if (!hdr)
			break;

		if (mptcp_nl_fill_addr(msg, entry) < 0) {
			/* drop the partial message, this entry is not recorded */
			genlmsg_cancel(msg, hdr);
			break;
		}

		id = entry->addr.id;
		genlmsg_end(msg, hdr);
	}
	spin_unlock_bh(&pernet->lock);

	/* remember the resume point for the next invocation */
	cb->args[0] = id;
	return msg->len;
}
/* Read the optional u32 limit attribute @id into @limit.
 *
 * When the attribute is absent @limit is left untouched and 0 is
 * returned.  When present, @limit is overwritten with the value; values
 * above MPTCP_PM_ADDR_MAX are rejected with -EINVAL.
 */
static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
{
	struct nlattr *attr = info->attrs[id];

	if (attr) {
		*limit = nla_get_u32(attr);
		if (*limit > MPTCP_PM_ADDR_MAX) {
			GENL_SET_ERR_MSG(info, "limit greater than maximum");
			return -EINVAL;
		}
	}

	return 0;
}
/* MPTCP_PM_CMD_SET_LIMITS handler: update the accepted ADD_ADDR and the
 * subflows limits.  A limit whose attribute is missing keeps its current
 * value; nothing is written if either attribute is out of range.
 */
static int
mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info)
{
	struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
	unsigned int rcv_addrs, subflows;
	int ret;

	spin_lock_bh(&pernet->lock);

	/* start from the current values, parse_limit() overrides them */
	rcv_addrs = pernet->add_addr_accept_max;
	ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
	if (!ret) {
		subflows = pernet->subflows_max;
		ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
	}

	if (!ret) {
		WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
		WRITE_ONCE(pernet->subflows_max, subflows);
	}

	spin_unlock_bh(&pernet->lock);
	return ret;
}
/* MPTCP_PM_CMD_GET_LIMITS handler.
 *
 * Builds a reply carrying the current add_addr_accept_max and subflows_max
 * values and sends it to the requester. Returns -ENOMEM if the reply skb
 * cannot be allocated, -EMSGSIZE if the header or an attribute does not fit,
 * otherwise the result of genlmsg_reply().
 */
static int
mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info)
{
	struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
	struct sk_buff *reply_skb;
	void *hdr;

	reply_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!reply_skb)
		return -ENOMEM;

	hdr = genlmsg_put_reply(reply_skb, info, &mptcp_genl_family, 0,
				MPTCP_PM_CMD_GET_LIMITS);
	if (!hdr)
		goto fail;

	/* limits are read locklessly, paired with WRITE_ONCE() in the setter */
	if (nla_put_u32(reply_skb, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
			READ_ONCE(pernet->add_addr_accept_max)) ||
	    nla_put_u32(reply_skb, MPTCP_PM_ATTR_SUBFLOWS,
			READ_ONCE(pernet->subflows_max)))
		goto fail;

	genlmsg_end(reply_skb, hdr);
	return genlmsg_reply(reply_skb, info);

fail:
	GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
	nlmsg_free(reply_skb);
	return -EMSGSIZE;
}
/* Generic netlink command table for the MPTCP path manager.
 *
 * Commands that change state (add/del/flush address, set limits) carry
 * GENL_ADMIN_PERM; the two GET commands have no flags set.
 */
static struct genl_ops mptcp_pm_ops[] = {
	{
		.cmd	= MPTCP_PM_CMD_ADD_ADDR,
		.flags	= GENL_ADMIN_PERM,
		.doit	= mptcp_nl_cmd_add_addr,
	},
	{
		.cmd	= MPTCP_PM_CMD_DEL_ADDR,
		.flags	= GENL_ADMIN_PERM,
		.doit	= mptcp_nl_cmd_del_addr,
	},
	{
		.cmd	= MPTCP_PM_CMD_FLUSH_ADDRS,
		.flags	= GENL_ADMIN_PERM,
		.doit	= mptcp_nl_cmd_flush_addrs,
	},
	{
		/* single lookup via .doit, full listing via .dumpit */
		.cmd	= MPTCP_PM_CMD_GET_ADDR,
		.doit	= mptcp_nl_cmd_get_addr,
		.dumpit	= mptcp_nl_cmd_dump_addrs,
	},
	{
		.cmd	= MPTCP_PM_CMD_SET_LIMITS,
		.flags	= GENL_ADMIN_PERM,
		.doit	= mptcp_nl_cmd_set_limits,
	},
	{
		.cmd	= MPTCP_PM_CMD_GET_LIMITS,
		.doit	= mptcp_nl_cmd_get_limits,
	},
};
/* Generic netlink family descriptor for the MPTCP path manager; ties together
 * the attribute policy, the command table and the multicast groups, and is
 * usable from any network namespace (netnsok).
 */
static struct genl_family mptcp_genl_family __ro_after_init = {
	.name		= MPTCP_PM_NAME,
	.version	= MPTCP_PM_VER,
	.module		= THIS_MODULE,
	.netnsok	= true,
	.maxattr	= MPTCP_PM_ATTR_MAX,
	.policy		= mptcp_pm_policy,
	.ops		= mptcp_pm_ops,
	.n_ops		= ARRAY_SIZE(mptcp_pm_ops),
	.mcgrps		= mptcp_pm_mcgrps,
	.n_mcgrps	= ARRAY_SIZE(mptcp_pm_mcgrps),
};
/* Per-netns init hook: set up an empty local address list, reset the
 * counters, start address ids at 1 and initialise the lock. Always returns 0.
 */
static int __net_init pm_nl_init_net(struct net *net)
{
	struct pm_nl_pernet *pernet;

	pernet = net_generic(net, pm_nl_pernet_id);

	INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
	__reset_counters(pernet);
	pernet->next_id = 1;
	spin_lock_init(&pernet->lock);

	return 0;
}
/* Batched per-netns exit hook: flush the path-manager address list of every
 * namespace on @net_list.
 */
static void __net_exit pm_nl_exit_net(struct list_head *net_list)
{
	struct net *net;

	list_for_each_entry(net, net_list, exit_list) {
		struct pm_nl_pernet *pernet;

		pernet = net_generic(net, pm_nl_pernet_id);

		/* net is removed from namespace list, can't race with
		 * other modifiers
		 */
		__flush_addrs(pernet);
	}
}
static struct pernet_operations mptcp_pm_pernet_ops = {
.init = pm_nl_init_net,
.exit_batch = pm_nl_exit_net,
.id = &pm_nl_pernet_id,
.size = sizeof(struct pm_nl_pernet),
};
/* Register the path-manager pernet subsystem and its generic netlink family.
 * Either registration failing is treated as fatal (panic).
 */
void mptcp_pm_nl_init(void)
{
	int err;

	err = register_pernet_subsys(&mptcp_pm_pernet_ops);
	if (err < 0)
		panic("Failed to register MPTCP PM pernet subsystem.\n");

	err = genl_register_family(&mptcp_genl_family);
	if (err)
		panic("Failed to register MPTCP PM netlink family\n");
}
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#endif #endif
#include <net/mptcp.h> #include <net/mptcp.h>
#include "protocol.h" #include "protocol.h"
#include "mib.h"
#define MPTCP_SAME_STATE TCP_MAX_STATES #define MPTCP_SAME_STATE TCP_MAX_STATES
...@@ -37,6 +38,8 @@ struct mptcp_skb_cb { ...@@ -37,6 +38,8 @@ struct mptcp_skb_cb {
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
static struct percpu_counter mptcp_sockets_allocated;
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket. * completed yet or has failed, return the subflow socket.
* Otherwise return NULL. * Otherwise return NULL.
...@@ -104,19 +107,6 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) ...@@ -104,19 +107,6 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
return ssock; return ssock;
} }
static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
sock_owned_by_me((const struct sock *)msk);
mptcp_for_each_subflow(msk, subflow) {
return mptcp_subflow_tcp_sock(subflow);
}
return NULL;
}
static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
struct sk_buff *skb, struct sk_buff *skb,
unsigned int offset, size_t copy_len) unsigned int offset, size_t copy_len)
...@@ -254,6 +244,60 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) ...@@ -254,6 +244,60 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
sk->sk_data_ready(sk); sk->sk_data_ready(sk);
} }
/* Move every subflow queued on msk->join_list to the tail of msk->conn_list.
 * The emptiness check is done without the lock as a fast path; the splice
 * itself runs under join_list_lock.
 */
static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (unlikely(!list_empty(&msk->join_list))) {
		spin_lock_bh(&msk->join_list_lock);
		list_splice_tail_init(&msk->join_list, &msk->conn_list);
		spin_unlock_bh(&msk->join_list_lock);
	}
}
static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
long tout = ssk && inet_csk(ssk)->icsk_pending ?
inet_csk(ssk)->icsk_timeout - jiffies : 0;
if (tout <= 0)
tout = mptcp_sk(sk)->timer_ival;
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
static bool mptcp_timer_pending(struct sock *sk)
{
return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}
/* (Re)arm the MPTCP-level retransmit timer to fire msk->timer_ival jiffies
 * from now. A zero interval triggers a one-time warning and is replaced by
 * TCP_RTO_MIN.
 */
static void mptcp_reset_timer(struct sock *sk)
{
	unsigned long ival = READ_ONCE(mptcp_sk(sk)->timer_ival);

	/* should never be called with mptcp level timer cleared */
	if (WARN_ON_ONCE(!ival))
		ival = TCP_RTO_MIN;

	sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer,
		       jiffies + ival);
}
/* Re-arm the MPTCP retransmit timer; if the socket is not currently
 * writeable, also schedule the msk worker, taking a socket reference when
 * the work was actually queued.
 */
void mptcp_data_acked(struct sock *sk)
{
	mptcp_reset_timer(sk);

	if (sk_stream_is_writeable(sk))
		return;

	if (schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
}
/* Stop the MPTCP-level retransmit timer and clear the stored interval. */
static void mptcp_stop_timer(struct sock *sk)
{
	sk_stop_timer(sk, &inet_csk(sk)->icsk_retransmit_timer);
	mptcp_sk(sk)->timer_ival = 0;
}
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{ {
if (!msk->cached_ext) if (!msk->cached_ext)
...@@ -277,7 +321,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) ...@@ -277,7 +321,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
return NULL; return NULL;
} }
static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, static bool mptcp_skb_can_collapse_to(u64 write_seq,
const struct sk_buff *skb, const struct sk_buff *skb,
const struct mptcp_ext *mpext) const struct mptcp_ext *mpext)
{ {
...@@ -285,33 +329,141 @@ static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, ...@@ -285,33 +329,141 @@ static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
return false; return false;
/* can collapse only if MPTCP level sequence is in order */ /* can collapse only if MPTCP level sequence is in order */
return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; return mpext && mpext->data_seq + mpext->data_len == write_seq;
}
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
const struct page_frag *pfrag,
const struct mptcp_data_frag *df)
{
return df && pfrag->page == df->page &&
df->data_seq + df->data_len == msk->write_seq;
}
/* Give back @len bytes of memory accounting on @sk: uncharge the socket
 * memory and subtract the same amount from the queued write memory.
 */
static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_mem_uncharge(sk, len);
	sk_wmem_queued_add(sk, -len);
}
/* Unlink @dfrag from its list, uncharge its payload plus header overhead
 * from @sk, and drop the page reference held by the fragment.
 */
static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
	int charged = dfrag->data_len + dfrag->overhead;

	list_del(&dfrag->list);
	dfrag_uncharge(sk, charged);
	put_page(dfrag->page);
}
/* Release data on the MPTCP-level retransmit queue that has been acked at
 * the MPTCP level (everything below msk->snd_una).
 *
 * Fully acked fragments are removed from msk->rtx_queue. If the new head
 * fragment is only partially acked, its acked prefix is trimmed and
 * uncharged. When anything was released, memory is partially reclaimed and
 * writers are woken, provided MPTCP_SEND_SPACE is set.
 */
static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	u64 snd_una = atomic64_read(&msk->snd_una);
	bool cleaned = false;

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		/* stop at the first fragment that still has un-acked bytes */
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		/* Acked prefix of the head fragment. The loop above guarantees
		 * the fragment ends after snd_una, so delta < data_len.
		 */
		u64 delta = snd_una - dfrag->data_seq;

		/* Advance the in-page offset together with the sequence so a
		 * later retransmission, which reads the payload at
		 * dfrag->offset, still sends the bytes matching data_seq.
		 */
		dfrag->data_seq += delta;
		dfrag->offset += delta;
		dfrag->data_len -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

	if (cleaned) {
		sk_mem_reclaim_partial(sk);

		/* Only wake up writers if a subflow is ready */
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
			sk_stream_write_space(sk);
	}
}
/* Refill @pfrag so that it can hold a struct mptcp_data_frag header plus at
 * least a minimal amount (32 bytes) of payload.
 *
 * Returns true on success. On failure the socket enters memory pressure, its
 * send buffer is moderated, and false is returned.
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (unlikely(!skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
					   pfrag, sk->sk_allocation))) {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
		return false;
	}

	return true;
}
static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
int orig_offset)
{
int offset = ALIGN(orig_offset, sizeof(long));
struct mptcp_data_frag *dfrag;
dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
dfrag->data_len = 0;
dfrag->data_seq = msk->write_seq;
dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
dfrag->offset = offset + sizeof(struct mptcp_data_frag);
dfrag->page = pfrag->page;
return dfrag;
} }
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct msghdr *msg, long *timeo, int *pmss_now, struct msghdr *msg, struct mptcp_data_frag *dfrag,
long *timeo, int *pmss_now,
int *ps_goal) int *ps_goal)
{ {
int mss_now, avail_size, size_goal, ret; int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
bool dfrag_collapsed, can_collapse = false;
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_ext *mpext = NULL; struct mptcp_ext *mpext = NULL;
bool retransmission = !!dfrag;
struct sk_buff *skb, *tail; struct sk_buff *skb, *tail;
bool can_collapse = false;
struct page_frag *pfrag; struct page_frag *pfrag;
struct page *page;
u64 *write_seq;
size_t psize; size_t psize;
/* use the mptcp page cache so that we can easily move the data /* use the mptcp page cache so that we can easily move the data
* from one substream to another, but do per subflow memory accounting * from one substream to another, but do per subflow memory accounting
* Note: pfrag is used only when !retransmission, but the compiler is
* fooled into a warning if we don't init here
*/ */
pfrag = sk_page_frag(sk); pfrag = sk_page_frag(sk);
while (!sk_page_frag_refill(ssk, pfrag) || while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
!mptcp_ext_cache_refill(msk)) { !mptcp_ext_cache_refill(msk)) {
ret = sk_stream_wait_memory(ssk, timeo); ret = sk_stream_wait_memory(ssk, timeo);
if (ret) if (ret)
return ret; return ret;
/* if sk_stream_wait_memory() sleeps snd_una can change
* significantly, refresh the rtx queue
*/
mptcp_clean_una(sk);
if (unlikely(__mptcp_needs_tcp_fallback(msk))) if (unlikely(__mptcp_needs_tcp_fallback(msk)))
return 0; return 0;
} }
if (!retransmission) {
write_seq = &msk->write_seq;
page = pfrag->page;
} else {
write_seq = &dfrag->data_seq;
page = dfrag->page;
}
/* compute copy limit */ /* compute copy limit */
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
...@@ -329,33 +481,75 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -329,33 +481,75 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* SSN association set here * SSN association set here
*/ */
can_collapse = (size_goal - skb->len > 0) && can_collapse = (size_goal - skb->len > 0) &&
mptcp_skb_can_collapse_to(msk, skb, mpext); mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
if (!can_collapse) if (!can_collapse)
TCP_SKB_CB(skb)->eor = 1; TCP_SKB_CB(skb)->eor = 1;
else else
avail_size = size_goal - skb->len; avail_size = size_goal - skb->len;
} }
psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
if (!retransmission) {
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
*/
dfrag = mptcp_rtx_tail(sk);
offset = pfrag->offset;
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
if (!dfrag_collapsed) {
dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
offset = dfrag->offset;
frag_truesize = dfrag->overhead;
}
psize = min_t(size_t, pfrag->size - offset, avail_size);
/* Copy to page */ /* Copy to page */
pr_debug("left=%zu", msg_data_left(msg)); pr_debug("left=%zu", msg_data_left(msg));
psize = copy_page_from_iter(pfrag->page, pfrag->offset, psize = copy_page_from_iter(pfrag->page, offset,
min_t(size_t, msg_data_left(msg), psize), min_t(size_t, msg_data_left(msg),
psize),
&msg->msg_iter); &msg->msg_iter);
pr_debug("left=%zu", msg_data_left(msg)); pr_debug("left=%zu", msg_data_left(msg));
if (!psize) if (!psize)
return -EINVAL; return -EINVAL;
if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
return -ENOMEM;
} else {
offset = dfrag->offset;
psize = min_t(size_t, dfrag->data_len, avail_size);
}
/* tell the TCP stack to delay the push so that we can safely /* tell the TCP stack to delay the push so that we can safely
* access the skb after the sendpages call * access the skb after the sendpages call
*/ */
ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, ret = do_tcp_sendpages(ssk, page, offset, psize,
msg->msg_flags | MSG_SENDPAGE_NOTLAST); msg->msg_flags | MSG_SENDPAGE_NOTLAST);
if (ret <= 0) if (ret <= 0)
return ret; return ret;
frag_truesize += ret;
if (!retransmission) {
if (unlikely(ret < psize)) if (unlikely(ret < psize))
iov_iter_revert(&msg->msg_iter, psize - ret); iov_iter_revert(&msg->msg_iter, psize - ret);
/* send successful, keep track of sent data for mptcp-level
* retransmission
*/
dfrag->data_len += ret;
if (!dfrag_collapsed) {
get_page(dfrag->page);
list_add_tail(&dfrag->list, &msk->rtx_queue);
sk_wmem_queued_add(sk, frag_truesize);
} else {
sk_wmem_queued_add(sk, ret);
}
/* charge data on mptcp rtx queue to the master socket
* Note: we charge such data both to sk and ssk
*/
sk->sk_forward_alloc -= frag_truesize;
}
/* if the tail skb extension is still the cached one, collapsing /* if the tail skb extension is still the cached one, collapsing
* really happened. Note: we can't check for 'same skb' as the sk_buff * really happened. Note: we can't check for 'same skb' as the sk_buff
* hdr on tail can be transmitted, freed and re-allocated by the * hdr on tail can be transmitted, freed and re-allocated by the
...@@ -373,7 +567,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -373,7 +567,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
msk->cached_ext = NULL; msk->cached_ext = NULL;
memset(mpext, 0, sizeof(*mpext)); memset(mpext, 0, sizeof(*mpext));
mpext->data_seq = msk->write_seq; mpext->data_seq = *write_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
mpext->data_len = ret; mpext->data_len = ret;
mpext->use_map = 1; mpext->use_map = 1;
...@@ -384,13 +578,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -384,13 +578,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->dsn64); mpext->dsn64);
out: out:
pfrag->offset += ret; if (!retransmission)
msk->write_seq += ret; pfrag->offset += frag_truesize;
*write_seq += ret;
mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
return ret; return ret;
} }
/* Pick a subflow socket to transmit on. Must be called with the msk socket
 * owned by the caller.
 *
 * Subflows are examined in conn_list order. If a subflow without free stream
 * memory is met, MPTCP_SEND_SPACE is cleared, SOCK_NOSPACE is set on its
 * socket (when it has one) and NULL is returned. Otherwise the first
 * non-backup subflow is returned; a backup subflow is returned only when no
 * non-backup one is found.
 */
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!sk_stream_memory_free(ssk)) {
			struct socket *sock = ssk->sk_socket;

			if (sock) {
				clear_bit(MPTCP_SEND_SPACE, &msk->flags);
				smp_mb__after_atomic();

				/* enables sk->write_space() callbacks */
				set_bit(SOCK_NOSPACE, &sock->flags);
			}

			return NULL;
		}

		if (!subflow->backup)
			return ssk;

		/* remember only the first backup subflow seen */
		if (!backup)
			backup = ssk;
	}

	return backup;
}
static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{ {
struct socket *sock; struct socket *sock;
...@@ -438,17 +670,29 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -438,17 +670,29 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return ret >= 0 ? ret + copied : (copied ? copied : ret); return ret >= 0 ? ret + copied : (copied ? copied : ret);
} }
ssk = mptcp_subflow_get(msk); mptcp_clean_una(sk);
if (!ssk) {
release_sock(sk); __mptcp_flush_join_list(msk);
return -ENOTCONN; ssk = mptcp_subflow_get_send(msk);
while (!sk_stream_memory_free(sk) || !ssk) {
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
mptcp_clean_una(sk);
ssk = mptcp_subflow_get_send(msk);
if (list_empty(&msk->conn_list)) {
ret = -ENOTCONN;
goto out;
}
} }
pr_debug("conn_list->subflow=%p", ssk); pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk); lock_sock(ssk);
while (msg_data_left(msg)) { while (msg_data_left(msg)) {
ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal); &size_goal);
if (ret < 0) if (ret < 0)
break; break;
...@@ -461,10 +705,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -461,10 +705,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
copied += ret; copied += ret;
} }
mptcp_set_timeout(sk, ssk);
if (copied) { if (copied) {
ret = copied; ret = copied;
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
size_goal); size_goal);
/* start the timer, if it's not pending */
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
} }
ssk_check_wmem(msk, ssk); ssk_check_wmem(msk, ssk);
...@@ -572,6 +821,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -572,6 +821,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
len = min_t(size_t, len, INT_MAX); len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
__mptcp_flush_join_list(msk);
while (len > (size_t)copied) { while (len > (size_t)copied) {
int bytes_read; int bytes_read;
...@@ -651,6 +901,69 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -651,6 +901,69 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return copied; return copied;
} }
/* MPTCP-level retransmit timeout handling.
 *
 * If everything written has been acked (snd_una == write_seq) the timer is
 * stopped. Otherwise MPTCP_WORK_RTX is flagged and the msk worker is
 * scheduled, taking a socket reference when the work was actually queued.
 */
static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (atomic64_read(&msk->snd_una) == msk->write_seq) {
		mptcp_stop_timer(sk);
		return;
	}

	set_bit(MPTCP_WORK_RTX, &msk->flags);
	if (schedule_work(&msk->work))
		sock_hold(sk);
}
/* Timer callback for the msk's icsk_retransmit_timer.
 *
 * Runs the retransmit handler directly when the socket is not owned by user
 * context; otherwise defers it by setting TCP_WRITE_TIMER_DEFERRED (holding
 * an extra reference if the bit was newly set). Drops one socket reference
 * before returning.
 */
static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	} else {
		mptcp_retransmit_handler(sk);
	}
	bh_unlock_sock(sk);

	sock_put(sk);
}
/* Find an idle subflow to use for an MPTCP-level retransmission. Must be
 * called with the msk socket owned by the caller.
 *
 * Return NULL as soon as a subflow with a non-empty TCP write queue is met
 * (there is still data outstanding at TCP level).
 *
 * A backup subflow is returned only if that is the only kind available.
 */
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		/* still data outstanding at TCP level? Don't retransmit. */
		if (!tcp_write_queue_empty(ssk))
			return NULL;

		if (!subflow->backup)
			return ssk;

		/* remember only the first backup subflow seen */
		if (!backup)
			backup = ssk;
	}

	return backup;
}
/* subflow sockets can be either outgoing (connect) or incoming /* subflow sockets can be either outgoing (connect) or incoming
* (accept). * (accept).
* *
...@@ -684,10 +997,63 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) ...@@ -684,10 +997,63 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
static void mptcp_worker(struct work_struct *work) static void mptcp_worker(struct work_struct *work)
{ {
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *sk = &msk->sk.icsk_inet.sk; struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
struct mptcp_data_frag *dfrag;
u64 orig_write_seq;
size_t copied = 0;
struct msghdr msg;
long timeo = 0;
lock_sock(sk); lock_sock(sk);
mptcp_clean_una(sk);
__mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk); __mptcp_move_skbs(msk);
if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
goto unlock;
dfrag = mptcp_rtx_head(sk);
if (!dfrag)
goto unlock;
ssk = mptcp_subflow_get_retrans(msk);
if (!ssk)
goto reset_unlock;
lock_sock(ssk);
msg.msg_flags = MSG_DONTWAIT;
orig_len = dfrag->data_len;
orig_offset = dfrag->offset;
orig_write_seq = dfrag->data_seq;
while (dfrag->data_len > 0) {
ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
&size_goal);
if (ret < 0)
break;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
copied += ret;
dfrag->data_len -= ret;
dfrag->offset += ret;
}
if (copied)
tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
size_goal);
dfrag->data_seq = orig_write_seq;
dfrag->offset = orig_offset;
dfrag->data_len = orig_len;
mptcp_set_timeout(sk, ssk);
release_sock(ssk);
reset_unlock:
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
unlock:
release_sock(sk); release_sock(sk);
sock_put(sk); sock_put(sk);
} }
...@@ -696,22 +1062,55 @@ static int __mptcp_init_sock(struct sock *sk) ...@@ -696,22 +1062,55 @@ static int __mptcp_init_sock(struct sock *sk)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
spin_lock_init(&msk->join_list_lock);
INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags); __set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker); INIT_WORK(&msk->work, mptcp_worker);
msk->first = NULL; msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
mptcp_pm_data_init(msk);
/* re-use the csk retrans timer for MPTCP-level retrans */
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
return 0; return 0;
} }
static int mptcp_init_sock(struct sock *sk) static int mptcp_init_sock(struct sock *sk)
{ {
if (!mptcp_is_enabled(sock_net(sk))) struct net *net = sock_net(sk);
int ret;
if (!mptcp_is_enabled(net))
return -ENOPROTOOPT; return -ENOPROTOOPT;
return __mptcp_init_sock(sk); if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
ret = __mptcp_init_sock(sk);
if (ret)
return ret;
sk_sockets_allocated_inc(sk);
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
return 0;
}
static void __mptcp_clear_xmit(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
dfrag_clear(sk, dfrag);
} }
static void mptcp_cancel_work(struct sock *sk) static void mptcp_cancel_work(struct sock *sk)
...@@ -767,10 +1166,14 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -767,10 +1166,14 @@ static void mptcp_close(struct sock *sk, long timeout)
mptcp_token_destroy(msk->token); mptcp_token_destroy(msk->token);
inet_sk_state_store(sk, TCP_CLOSE); inet_sk_state_store(sk, TCP_CLOSE);
__mptcp_flush_join_list(msk);
list_splice_init(&msk->conn_list, &conn_list); list_splice_init(&msk->conn_list, &conn_list);
data_fin_tx_seq = msk->write_seq; data_fin_tx_seq = msk->write_seq;
__mptcp_clear_xmit(sk);
release_sock(sk); release_sock(sk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) { list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
...@@ -782,6 +1185,7 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -782,6 +1185,7 @@ static void mptcp_close(struct sock *sk, long timeout)
} }
mptcp_cancel_work(sk); mptcp_cancel_work(sk);
mptcp_pm_close(msk);
__skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue);
...@@ -811,6 +1215,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) ...@@ -811,6 +1215,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
} }
/* .disconnect handler for the MPTCP socket: drop the MPTCP-level transmit
 * state under the socket lock, cancel pending msk work, then hand over to
 * tcp_disconnect() and return its result.
 */
static int mptcp_disconnect(struct sock *sk, int flags)
{
	lock_sock(sk);
	__mptcp_clear_xmit(sk);
	release_sock(sk);

	mptcp_cancel_work(sk);

	return tcp_disconnect(sk, flags);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6) #if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{ {
...@@ -854,6 +1267,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req) ...@@ -854,6 +1267,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
} }
msk->write_seq = subflow_req->idsn + 1; msk->write_seq = subflow_req->idsn + 1;
atomic64_set(&msk->snd_una, msk->write_seq);
if (subflow_req->remote_key_valid) { if (subflow_req->remote_key_valid) {
msk->can_ack = true; msk->can_ack = true;
msk->remote_key = subflow_req->remote_key; msk->remote_key = subflow_req->remote_key;
...@@ -920,7 +1334,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, ...@@ -920,7 +1334,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
list_add(&subflow->node, &msk->conn_list); list_add(&subflow->node, &msk->conn_list);
bh_unlock_sock(new_mptcp_sock); bh_unlock_sock(new_mptcp_sock);
__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
local_bh_enable(); local_bh_enable();
} else {
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
} }
return newsk; return newsk;
...@@ -932,6 +1351,8 @@ static void mptcp_destroy(struct sock *sk) ...@@ -932,6 +1351,8 @@ static void mptcp_destroy(struct sock *sk)
if (msk->cached_ext) if (msk->cached_ext)
__skb_ext_put(msk->cached_ext); __skb_ext_put(msk->cached_ext);
sk_sockets_allocated_dec(sk);
} }
static int mptcp_setsockopt(struct sock *sk, int level, int optname, static int mptcp_setsockopt(struct sock *sk, int level, int optname,
...@@ -984,7 +1405,8 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, ...@@ -984,7 +1405,8 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
#define MPTCP_DEFERRED_ALL TCPF_DELACK_TIMER_DEFERRED #define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED)
/* this is very alike tcp_release_cb() but we must handle differently a /* this is very alike tcp_release_cb() but we must handle differently a
* different set of events * different set of events
...@@ -1000,6 +1422,8 @@ static void mptcp_release_cb(struct sock *sk) ...@@ -1000,6 +1422,8 @@ static void mptcp_release_cb(struct sock *sk)
nflags = flags & ~MPTCP_DEFERRED_ALL; nflags = flags & ~MPTCP_DEFERRED_ALL;
} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
sock_release_ownership(sk);
if (flags & TCPF_DELACK_TIMER_DEFERRED) { if (flags & TCPF_DELACK_TIMER_DEFERRED) {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
struct sock *ssk; struct sock *ssk;
...@@ -1008,6 +1432,11 @@ static void mptcp_release_cb(struct sock *sk) ...@@ -1008,6 +1432,11 @@ static void mptcp_release_cb(struct sock *sk)
if (!ssk || !schedule_work(&msk->work)) if (!ssk || !schedule_work(&msk->work))
__sock_put(sk); __sock_put(sk);
} }
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
mptcp_retransmit_handler(sk);
__sock_put(sk);
}
} }
static int mptcp_get_port(struct sock *sk, unsigned short snum) static int mptcp_get_port(struct sock *sk, unsigned short snum)
...@@ -1031,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk) ...@@ -1031,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk)
u64 ack_seq; u64 ack_seq;
subflow = mptcp_subflow_ctx(ssk); subflow = mptcp_subflow_ctx(ssk);
if (!subflow->mp_capable)
return;
sk = subflow->conn; sk = subflow->conn;
msk = mptcp_sk(sk); msk = mptcp_sk(sk);
if (!subflow->mp_capable) {
MPTCP_INC_STATS(sock_net(sk),
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
return;
}
pr_debug("msk=%p, token=%u", sk, subflow->token); pr_debug("msk=%p, token=%u", sk, subflow->token);
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
...@@ -1055,6 +1486,9 @@ void mptcp_finish_connect(struct sock *ssk) ...@@ -1055,6 +1486,9 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->can_ack, 1);
atomic64_set(&msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, 0);
} }
static void mptcp_sock_graft(struct sock *sk, struct socket *parent) static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
...@@ -1066,6 +1500,46 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) ...@@ -1066,6 +1500,46 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_unlock_bh(&sk->sk_callback_lock); write_unlock_bh(&sk->sk_callback_lock);
} }
/* Complete an MP_JOIN for subflow socket @sk.
 *
 * Returns false if the owning MPTCP socket is not in TCP_ESTABLISHED or if
 * the path manager does not allow a new subflow. Returns true immediately
 * when the msk is not the server side. Otherwise the subflow is grafted onto
 * the msk's struct socket (when the msk has one and the subflow has none)
 * and, once allowed, queued on msk->join_list.
 */
bool mptcp_finish_join(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *parent = (void *)msk;
	struct socket *parent_sock;

	pr_debug("msk=%p, subflow=%p", msk, subflow);

	/* mptcp socket already closing? */
	if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
		return false;

	if (!msk->pm.server_side)
		return true;

	/* passive connection, attach to msk socket */
	parent_sock = READ_ONCE(parent->sk_socket);
	if (parent_sock && !sk->sk_socket)
		mptcp_sock_graft(sk, parent_sock);

	if (!mptcp_pm_allow_new_subflow(msk))
		return false;

	/* active connections are already on conn_list */
	spin_lock_bh(&msk->join_list_lock);
	if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
		list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return true;
}
bool mptcp_sk_is_subflow(const struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
return subflow->mp_join == 1;
}
static bool mptcp_memory_free(const struct sock *sk, int wake) static bool mptcp_memory_free(const struct sock *sk, int wake)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
...@@ -1077,6 +1551,7 @@ static struct proto mptcp_prot = { ...@@ -1077,6 +1551,7 @@ static struct proto mptcp_prot = {
.name = "MPTCP", .name = "MPTCP",
.owner = THIS_MODULE, .owner = THIS_MODULE,
.init = mptcp_init_sock, .init = mptcp_init_sock,
.disconnect = mptcp_disconnect,
.close = mptcp_close, .close = mptcp_close,
.accept = mptcp_accept, .accept = mptcp_accept,
.setsockopt = mptcp_setsockopt, .setsockopt = mptcp_setsockopt,
...@@ -1089,7 +1564,12 @@ static struct proto mptcp_prot = { ...@@ -1089,7 +1564,12 @@ static struct proto mptcp_prot = {
.hash = inet_hash, .hash = inet_hash,
.unhash = inet_unhash, .unhash = inet_unhash,
.get_port = mptcp_get_port, .get_port = mptcp_get_port,
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.stream_memory_free = mptcp_memory_free, .stream_memory_free = mptcp_memory_free,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock), .obj_size = sizeof(struct mptcp_sock),
.no_autobind = true, .no_autobind = true,
}; };
...@@ -1245,6 +1725,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, ...@@ -1245,6 +1725,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket. /* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack. * This is needed so NOSPACE flag can be set from tcp stack.
*/ */
__mptcp_flush_join_list(msk);
list_for_each_entry(subflow, &msk->conn_list, node) { list_for_each_entry(subflow, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
...@@ -1326,6 +1807,7 @@ static int mptcp_shutdown(struct socket *sock, int how) ...@@ -1326,6 +1807,7 @@ static int mptcp_shutdown(struct socket *sock, int how)
sock->state = SS_CONNECTED; sock->state = SS_CONNECTED;
} }
__mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
...@@ -1376,7 +1858,11 @@ void mptcp_proto_init(void) ...@@ -1376,7 +1858,11 @@ void mptcp_proto_init(void)
{ {
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
panic("Failed to allocate MPTCP pcpu counter\n");
mptcp_subflow_init(); mptcp_subflow_init();
mptcp_pm_init();
if (proto_register(&mptcp_prot, 1) != 0) if (proto_register(&mptcp_prot, 1) != 0)
panic("Failed to register MPTCP proto.\n"); panic("Failed to register MPTCP proto.\n");
......
...@@ -17,6 +17,12 @@ ...@@ -17,6 +17,12 @@
#define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYN BIT(0)
#define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_SYNACK BIT(1)
#define OPTION_MPTCP_MPC_ACK BIT(2) #define OPTION_MPTCP_MPC_ACK BIT(2)
#define OPTION_MPTCP_MPJ_SYN BIT(3)
#define OPTION_MPTCP_MPJ_SYNACK BIT(4)
#define OPTION_MPTCP_MPJ_ACK BIT(5)
#define OPTION_MPTCP_ADD_ADDR BIT(6)
#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
#define OPTION_MPTCP_RM_ADDR BIT(8)
/* MPTCP option subtypes */ /* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0 #define MPTCPOPT_MP_CAPABLE 0
...@@ -33,12 +39,30 @@ ...@@ -33,12 +39,30 @@
#define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_SYNACK 12
#define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK 20
#define TCPOLEN_MPTCP_MPC_ACK_DATA 22 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22
#define TCPOLEN_MPTCP_MPJ_SYN 12
#define TCPOLEN_MPTCP_MPJ_SYNACK 16
#define TCPOLEN_MPTCP_MPJ_ACK 24
#define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_BASE 4
#define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK32 4
#define TCPOLEN_MPTCP_DSS_ACK64 8 #define TCPOLEN_MPTCP_DSS_ACK64 8
#define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP32 10
#define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_MAP64 14
#define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2
#define TCPOLEN_MPTCP_ADD_ADDR 16
#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18
#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8
#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10
#define TCPOLEN_MPTCP_ADD_ADDR6 28
#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22
#define TCPOLEN_MPTCP_PORT_LEN 2
#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
#define MPTCPOPT_HMAC_LEN 20
#define MPTCPOPT_THMAC_LEN 8
/* MPTCP MP_CAPABLE flags */ /* MPTCP MP_CAPABLE flags */
#define MPTCP_VERSION_MASK (0x0F) #define MPTCP_VERSION_MASK (0x0F)
...@@ -55,9 +79,75 @@ ...@@ -55,9 +79,75 @@
#define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_HAS_ACK BIT(0)
#define MPTCP_DSS_FLAG_MASK (0x1F) #define MPTCP_DSS_FLAG_MASK (0x1F)
/* MPTCP ADD_ADDR flags */
#define MPTCP_ADDR_ECHO BIT(0)
#define MPTCP_ADDR_HMAC_LEN 20
#define MPTCP_ADDR_IPVERSION_4 4
#define MPTCP_ADDR_IPVERSION_6 6
/* MPTCP socket flags */ /* MPTCP socket flags */
#define MPTCP_DATA_READY 0 #define MPTCP_DATA_READY 0
#define MPTCP_SEND_SPACE 1 #define MPTCP_SEND_SPACE 1
#define MPTCP_WORK_RTX 2
/* Build one 32-bit MPTCP option word and return it in network byte order.
 *
 * Layout before the byte swap: TCPOPT_MPTCP at bit 24, @len at bit 16,
 * @subopt at bit 12, the low four bits of @nib at bit 8 and @field in the
 * low byte.
 */
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
	u32 word = TCPOPT_MPTCP << 24;

	word |= len << 16;
	word |= subopt << 12;
	word |= (nib & 0xF) << 8;
	word |= field;

	return htonl(word);
}
#define MPTCP_PM_MAX_ADDR 4
/* One address as handled by the MPTCP path manager: address family, port,
 * an id, and the IPv4 or (with CONFIG_MPTCP_IPV6) IPv6 address itself.
 */
struct mptcp_addr_info {
/* AF_INET or AF_INET6; mptcp_info2sockaddr() uses it to pick the union member */
sa_family_t family;
/* copied unchanged into sin_port/sin6_port by mptcp_info2sockaddr() */
__be16 port;
/* NOTE(review): presumably the MPTCP address identifier - confirm */
u8 id;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct in6_addr addr6;
#endif
};
};
/* Path-manager status values.
 * NOTE(review): presumably recorded in mptcp_pm_data.status to describe the
 * event the path manager still has to handle - confirm against the PM code.
 */
enum mptcp_pm_status {
MPTCP_PM_ADD_ADDR_RECEIVED,
MPTCP_PM_ESTABLISHED,
MPTCP_PM_SUBFLOW_ESTABLISHED,
};
/* Path-manager state, embedded in struct mptcp_sock as its 'pm' member. */
struct mptcp_pm_data {
struct mptcp_addr_info local;
struct mptcp_addr_info remote;
spinlock_t lock; /* protects the whole PM data */
/* read locklessly with READ_ONCE() by mptcp_pm_should_signal() */
bool addr_signal;
/* tested by mptcp_finish_join() before grafting/queueing a joined subflow */
bool server_side;
bool work_pending;
bool accept_addr;
bool accept_subflow;
/* NOTE(review): the four u8 fields below look like usage counters, each
 * paired with the matching *_max limit further down - confirm
 */
u8 add_addr_signaled;
u8 add_addr_accepted;
u8 local_addr_used;
u8 subflows;
u8 add_addr_signal_max;
u8 add_addr_accept_max;
u8 local_addr_max;
u8 subflows_max;
/* NOTE(review): presumably holds enum mptcp_pm_status information - confirm */
u8 status;
struct work_struct work;
};
/* Element of the mptcp_sock rtx_queue list; mptcp_rtx_head() and
 * mptcp_rtx_tail() return entries of this type.
 */
struct mptcp_data_frag {
/* linkage into msk->rtx_queue */
struct list_head list;
/* NOTE(review): presumably the MPTCP-level sequence number of the first
 * byte of this fragment - confirm
 */
u64 data_seq;
/* NOTE(review): data_len/offset presumably describe the payload inside
 * 'page'; 'overhead' is not explained by the visible code - confirm
 */
int data_len;
int offset;
int overhead;
struct page *page;
};
/* MPTCP connection sock */ /* MPTCP connection sock */
struct mptcp_sock { struct mptcp_sock {
...@@ -67,14 +157,20 @@ struct mptcp_sock { ...@@ -67,14 +157,20 @@ struct mptcp_sock {
u64 remote_key; u64 remote_key;
u64 write_seq; u64 write_seq;
u64 ack_seq; u64 ack_seq;
atomic64_t snd_una;
unsigned long timer_ival;
u32 token; u32 token;
unsigned long flags; unsigned long flags;
bool can_ack; bool can_ack;
spinlock_t join_list_lock;
struct work_struct work; struct work_struct work;
struct list_head conn_list; struct list_head conn_list;
struct list_head rtx_queue;
struct list_head join_list;
struct skb_ext *cached_ext; /* for the next sendmsg */ struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first; struct sock *first;
struct mptcp_pm_data pm;
}; };
#define mptcp_for_each_subflow(__msk, __subflow) \ #define mptcp_for_each_subflow(__msk, __subflow) \
...@@ -85,17 +181,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) ...@@ -85,17 +181,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
return (struct mptcp_sock *)sk; return (struct mptcp_sock *)sk;
} }
static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
if (list_empty(&msk->rtx_queue))
return NULL;
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
}
static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
if (list_empty(&msk->rtx_queue))
return NULL;
return list_first_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
}
struct mptcp_subflow_request_sock { struct mptcp_subflow_request_sock {
struct tcp_request_sock sk; struct tcp_request_sock sk;
u16 mp_capable : 1, u16 mp_capable : 1,
mp_join : 1, mp_join : 1,
backup : 1, backup : 1,
remote_key_valid : 1; remote_key_valid : 1;
u8 local_id;
u8 remote_id;
u64 local_key; u64 local_key;
u64 remote_key; u64 remote_key;
u64 idsn; u64 idsn;
u32 token; u32 token;
u32 ssn_offset; u32 ssn_offset;
u64 thmac;
u32 local_nonce;
u32 remote_nonce;
}; };
static inline struct mptcp_subflow_request_sock * static inline struct mptcp_subflow_request_sock *
...@@ -118,16 +239,28 @@ struct mptcp_subflow_context { ...@@ -118,16 +239,28 @@ struct mptcp_subflow_context {
u32 ssn_offset; u32 ssn_offset;
u32 map_data_len; u32 map_data_len;
u32 request_mptcp : 1, /* send MP_CAPABLE */ u32 request_mptcp : 1, /* send MP_CAPABLE */
request_join : 1, /* send MP_JOIN */
request_bkup : 1,
mp_capable : 1, /* remote is MPTCP capable */ mp_capable : 1, /* remote is MPTCP capable */
mp_join : 1, /* remote is JOINing */
fully_established : 1, /* path validated */ fully_established : 1, /* path validated */
pm_notified : 1, /* PM hook called for established status */
conn_finished : 1, conn_finished : 1,
map_valid : 1, map_valid : 1,
mpc_map : 1, mpc_map : 1,
backup : 1,
data_avail : 1, data_avail : 1,
rx_eof : 1, rx_eof : 1,
data_fin_tx_enable : 1, data_fin_tx_enable : 1,
can_ack : 1; /* only after processing the remote a key */ can_ack : 1; /* only after processing the remote a key */
u64 data_fin_tx_seq; u64 data_fin_tx_seq;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
u32 remote_token;
u8 hmac[MPTCPOPT_HMAC_LEN];
u8 local_id;
u8 remote_id;
struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */ struct sock *conn; /* parent mptcp_sock */
...@@ -171,6 +304,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) ...@@ -171,6 +304,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
int mptcp_is_enabled(struct net *net); int mptcp_is_enabled(struct net *net);
bool mptcp_subflow_data_available(struct sock *sk); bool mptcp_subflow_data_available(struct sock *sk);
void mptcp_subflow_init(void); void mptcp_subflow_init(void);
/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk, static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
...@@ -199,11 +337,14 @@ void mptcp_get_options(const struct sk_buff *skb, ...@@ -199,11 +337,14 @@ void mptcp_get_options(const struct sk_buff *skb,
void mptcp_finish_connect(struct sock *sk); void mptcp_finish_connect(struct sock *sk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
int mptcp_token_new_request(struct request_sock *req); int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(u32 token); void mptcp_token_destroy_request(u32 token);
int mptcp_token_new_connect(struct sock *sk); int mptcp_token_new_connect(struct sock *sk);
int mptcp_token_new_accept(u32 token, struct sock *conn); int mptcp_token_new_accept(u32 token, struct sock *conn);
struct mptcp_sock *mptcp_token_get_sock(u32 token);
void mptcp_token_destroy(u32 token); void mptcp_token_destroy(u32 token);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
...@@ -219,8 +360,48 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) ...@@ -219,8 +360,48 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
mptcp_crypto_key_sha(*key, token, idsn); mptcp_crypto_key_sha(*key, token, idsn);
} }
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
void *hash_out);
void mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
void mptcp_pm_close(struct mptcp_sock *msk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side);
void mptcp_pm_fully_established(struct mptcp_sock *msk);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk,
struct mptcp_subflow_context *subflow);
void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id);
static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk)
{
return READ_ONCE(msk->pm.addr_signal);
}
/* Length of an ADD_ADDR option for the given address family: the IPv4 size
 * for AF_INET, the IPv6 size for every other value.
 */
static inline unsigned int mptcp_add_addr_len(int family)
{
	return family == AF_INET ? TCPOLEN_MPTCP_ADD_ADDR :
				   TCPOLEN_MPTCP_ADD_ADDR6;
}
bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_addr_info *saddr);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
void mptcp_pm_nl_init(void);
void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
{ {
...@@ -234,4 +415,6 @@ static inline bool before64(__u64 seq1, __u64 seq2) ...@@ -234,4 +415,6 @@ static inline bool before64(__u64 seq1, __u64 seq2)
#define after64(seq2, seq1) before64(seq1, seq2) #define after64(seq2, seq1) before64(seq1, seq2)
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
#endif /* __MPTCP_PROTOCOL_H */ #endif /* __MPTCP_PROTOCOL_H */
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/netdevice.h> #include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <net/sock.h> #include <net/sock.h>
#include <net/inet_common.h> #include <net/inet_common.h>
#include <net/inet_hashtables.h> #include <net/inet_hashtables.h>
...@@ -19,17 +20,42 @@ ...@@ -19,17 +20,42 @@
#endif #endif
#include <net/mptcp.h> #include <net/mptcp.h>
#include "protocol.h" #include "protocol.h"
#include "mib.h"
/* Bump MPTCP MIB counter @field in the network namespace of request @req. */
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	struct net *net = sock_net(req_to_sk(req));

	MPTCP_INC_STATS(net, field);
}
static int subflow_rebuild_header(struct sock *sk) static int subflow_rebuild_header(struct sock *sk)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
int err = 0; int local_id, err = 0;
if (subflow->request_mptcp && !subflow->token) { if (subflow->request_mptcp && !subflow->token) {
pr_debug("subflow=%p", sk); pr_debug("subflow=%p", sk);
err = mptcp_token_new_connect(sk); err = mptcp_token_new_connect(sk);
} else if (subflow->request_join && !subflow->local_nonce) {
struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
pr_debug("subflow=%p", sk);
do {
get_random_bytes(&subflow->local_nonce, sizeof(u32));
} while (!subflow->local_nonce);
if (subflow->local_id)
goto out;
local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
if (local_id < 0)
return -EINVAL;
subflow->local_id = local_id;
} }
out:
if (err) if (err)
return err; return err;
...@@ -47,6 +73,51 @@ static void subflow_req_destructor(struct request_sock *req) ...@@ -47,6 +73,51 @@ static void subflow_req_destructor(struct request_sock *req)
tcp_request_sock_ops.destructor(req); tcp_request_sock_ops.destructor(req);
} }
/* Compute the MP_JOIN HMAC over the two nonces.
 *
 * The message is @nonce1 followed by @nonce2, each stored big-endian; the
 * keys are handed to mptcp_crypto_hmac_sha() in the order given. The result
 * is written to @hmac.
 */
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 nonces[2 * sizeof(u32)];

	put_unaligned_be32(nonce1, nonces);
	put_unaligned_be32(nonce2, nonces + sizeof(u32));

	mptcp_crypto_hmac_sha(key1, key2, nonces, sizeof(nonces), hmac);
}
/* Check the token received in an MP_JOIN SYN and prepare the SYN-ACK data.
 *
 * Looks up the MPTCP socket owning the token, asks the path manager for a
 * local address id, draws a random local nonce and stores the leading 64
 * bits of the resulting HMAC as the truncated HMAC of the request.
 *
 * Returns false if no socket matches the token (MPTCP_MIB_JOINNOTOKEN is
 * incremented) or if no local id is available, true otherwise. The
 * reference taken by the token lookup is always dropped before returning.
 */
static bool subflow_token_join_request(struct request_sock *req,
				       const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *sreq = mptcp_subflow_rsk(req);
	u8 digest[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	bool ok = false;
	int id;

	msk = mptcp_token_get_sock(sreq->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return false;
	}

	id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (id < 0)
		goto put_msk;
	sreq->local_id = id;

	get_random_bytes(&sreq->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      sreq->local_nonce, sreq->remote_nonce, digest);

	/* only the first 8 bytes of the HMAC are kept */
	sreq->thmac = get_unaligned_be64(digest);
	ok = true;

put_msk:
	sock_put((struct sock *)msk);
	return ok;
}
static void subflow_init_req(struct request_sock *req, static void subflow_init_req(struct request_sock *req,
const struct sock *sk_listener, const struct sock *sk_listener,
struct sk_buff *skb) struct sk_buff *skb)
...@@ -61,6 +132,7 @@ static void subflow_init_req(struct request_sock *req, ...@@ -61,6 +132,7 @@ static void subflow_init_req(struct request_sock *req,
mptcp_get_options(skb, &rx_opt); mptcp_get_options(skb, &rx_opt);
subflow_req->mp_capable = 0; subflow_req->mp_capable = 0;
subflow_req->mp_join = 0;
subflow_req->remote_key_valid = 0; subflow_req->remote_key_valid = 0;
#ifdef CONFIG_TCP_MD5SIG #ifdef CONFIG_TCP_MD5SIG
...@@ -71,6 +143,15 @@ static void subflow_init_req(struct request_sock *req, ...@@ -71,6 +143,15 @@ static void subflow_init_req(struct request_sock *req,
return; return;
#endif #endif
if (rx_opt.mptcp.mp_capable) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
if (rx_opt.mptcp.mp_join)
return;
} else if (rx_opt.mptcp.mp_join) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
}
if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
int err; int err;
...@@ -79,6 +160,19 @@ static void subflow_init_req(struct request_sock *req, ...@@ -79,6 +160,19 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->mp_capable = 1; subflow_req->mp_capable = 1;
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
} else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
subflow_req->mp_join = 1;
subflow_req->backup = rx_opt.mptcp.backup;
subflow_req->remote_id = rx_opt.mptcp.join_id;
subflow_req->token = rx_opt.mptcp.token;
subflow_req->remote_nonce = rx_opt.mptcp.nonce;
pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
subflow_req->remote_nonce);
if (!subflow_token_join_request(req, skb)) {
subflow_req->mp_join = 0;
// @@ need to trigger RST
}
} }
} }
...@@ -106,6 +200,25 @@ static void subflow_v6_init_req(struct request_sock *req, ...@@ -106,6 +200,25 @@ static void subflow_v6_init_req(struct request_sock *req,
} }
#endif #endif
/* Verify the truncated HMAC stored in @subflow.
 *
 * Recomputes the HMAC with the remote key/nonce first and the local ones
 * second, then compares its leading 64 bits with subflow->thmac.
 */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 digest[MPTCPOPT_HMAC_LEN];
	u64 expected;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      digest);
	expected = get_unaligned_be64(digest);

	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)expected,
		 (unsigned long long)subflow->thmac);

	return subflow->thmac == expected;
}
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
...@@ -118,7 +231,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -118,7 +231,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
parent->sk_state_change(parent); parent->sk_state_change(parent);
} }
if (!subflow->conn_finished) { if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
return;
if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key); subflow->remote_key);
mptcp_finish_connect(sk); mptcp_finish_connect(sk);
...@@ -128,6 +244,33 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) ...@@ -128,6 +244,33 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
subflow->ssn_offset = TCP_SKB_CB(skb)->seq; subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
} }
} else if (subflow->mp_join) {
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
subflow, subflow->thmac,
subflow->remote_nonce);
if (!subflow_thmac_valid(subflow)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
subflow->mp_join = 0;
goto do_reset;
}
subflow_generate_hmac(subflow->local_key, subflow->remote_key,
subflow->local_nonce,
subflow->remote_nonce,
subflow->hmac);
if (skb)
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
if (!mptcp_finish_join(sk))
goto do_reset;
subflow->conn_finished = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
} else {
do_reset:
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_done(sk);
} }
} }
...@@ -178,6 +321,32 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) ...@@ -178,6 +321,32 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
} }
#endif #endif
/* Verify the full HMAC received in the third ACK of an MP_JOIN handshake.
 *
 * Returns false if no MPTCP socket matches the request token or if the HMAC
 * computed from the msk keys and the request nonces differs from the one in
 * @rx_opt. The comparison uses crypto_memneq(). The reference taken by the
 * token lookup is dropped before returning.
 */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct tcp_options_received *rx_opt)
{
	const struct mptcp_subflow_request_sock *sreq = mptcp_subflow_rsk(req);
	u8 digest[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	bool valid;

	msk = mptcp_token_get_sock(sreq->token);
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      sreq->remote_nonce, sreq->local_nonce, digest);

	valid = !crypto_memneq(digest, rx_opt->mptcp.hmac, sizeof(digest));

	sock_put((struct sock *)msk);
	return valid;
}
static struct sock *subflow_syn_recv_sock(const struct sock *sk, static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb, struct sk_buff *skb,
struct request_sock *req, struct request_sock *req,
...@@ -188,6 +357,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -188,6 +357,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
struct mptcp_subflow_request_sock *subflow_req; struct mptcp_subflow_request_sock *subflow_req;
struct tcp_options_received opt_rx; struct tcp_options_received opt_rx;
bool fallback_is_fatal = false;
struct sock *new_msk = NULL; struct sock *new_msk = NULL;
struct sock *child; struct sock *child;
...@@ -221,6 +391,15 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -221,6 +391,15 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
new_msk = mptcp_sk_clone(listener->conn, req); new_msk = mptcp_sk_clone(listener->conn, req);
if (!new_msk) if (!new_msk)
subflow_req->mp_capable = 0; subflow_req->mp_capable = 0;
} else if (subflow_req->mp_join) {
fallback_is_fatal = true;
opt_rx.mptcp.mp_join = 0;
mptcp_get_options(skb, &opt_rx);
if (!opt_rx.mptcp.mp_join ||
!subflow_hmac_valid(req, &opt_rx)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
return NULL;
}
} }
create_child: create_child:
...@@ -230,20 +409,35 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -230,20 +409,35 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
if (child && *own_req) { if (child && *own_req) {
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
/* we have null ctx on TCP fallback, not fatal on MPC /* we have null ctx on TCP fallback, which is fatal on
* handshake * MPJ handshake
*/ */
if (!ctx) if (!ctx) {
if (fallback_is_fatal)
goto close_child;
goto out; goto out;
}
if (ctx->mp_capable) { if (ctx->mp_capable) {
/* new mpc subflow takes ownership of the newly /* new mpc subflow takes ownership of the newly
* created mptcp socket * created mptcp socket
*/ */
inet_sk_state_store((struct sock *)new_msk, inet_sk_state_store(new_msk, TCP_ESTABLISHED);
TCP_ESTABLISHED); mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
ctx->conn = new_msk; ctx->conn = new_msk;
new_msk = NULL; new_msk = NULL;
} else if (ctx->mp_join) {
struct mptcp_sock *owner;
owner = mptcp_token_get_sock(ctx->token);
if (!owner)
goto close_child;
ctx->conn = (struct sock *)owner;
if (!mptcp_finish_join(child))
goto close_child;
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
} }
} }
...@@ -252,6 +446,12 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ...@@ -252,6 +446,12 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
if (unlikely(new_msk)) if (unlikely(new_msk))
sock_put(new_msk); sock_put(new_msk);
return child; return child;
close_child:
tcp_send_active_reset(child, GFP_ATOMIC);
inet_csk_prepare_forced_close(child);
tcp_done(child);
return NULL;
} }
static struct inet_connection_sock_af_ops subflow_specific; static struct inet_connection_sock_af_ops subflow_specific;
...@@ -353,6 +553,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk) ...@@ -353,6 +553,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
data_len = mpext->data_len; data_len = mpext->data_len;
if (data_len == 0) { if (data_len == 0) {
pr_err("Infinite mapping not handled"); pr_err("Infinite mapping not handled");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
return MAPPING_INVALID; return MAPPING_INVALID;
} }
...@@ -396,8 +597,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk) ...@@ -396,8 +597,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
/* If this skb data are fully covered by the current mapping, /* If this skb data are fully covered by the current mapping,
* the new map would need caching, which is not supported * the new map would need caching, which is not supported
*/ */
if (skb_is_fully_mapped(ssk, skb)) if (skb_is_fully_mapped(ssk, skb)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
return MAPPING_INVALID; return MAPPING_INVALID;
}
/* will validate the next map after consuming the current one */ /* will validate the next map after consuming the current one */
return MAPPING_OK; return MAPPING_OK;
...@@ -566,7 +769,7 @@ static void subflow_data_ready(struct sock *sk) ...@@ -566,7 +769,7 @@ static void subflow_data_ready(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn; struct sock *parent = subflow->conn;
if (!subflow->mp_capable) { if (!subflow->mp_capable && !subflow->mp_join) {
subflow->tcp_data_ready(sk); subflow->tcp_data_ready(sk);
parent->sk_data_ready(parent); parent->sk_data_ready(parent);
...@@ -621,6 +824,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) ...@@ -621,6 +824,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
} }
#endif #endif
/* Convert a path-manager address into a zeroed sockaddr_storage.
 *
 * The family is always copied. Address and port are filled in only for
 * AF_INET and, when CONFIG_MPTCP_IPV6 is enabled, AF_INET6.
 */
static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;

	switch (addr->ss_family) {
	case AF_INET: {
		struct sockaddr_in *sin = (struct sockaddr_in *)addr;

		sin->sin_addr = info->addr;
		sin->sin_port = info->port;
		break;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	case AF_INET6: {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;

		sin6->sin6_addr = info->addr6;
		sin6->sin6_port = info->port;
		break;
	}
#endif
	default:
		break;
	}
}
/* Create an additional subflow for MPTCP socket @sk and start connecting it.
 *
 * A new subflow socket is created, given the msk keys and token, bound to
 * @loc on interface @ifindex and connected (non-blocking) to @remote with
 * request_join set. On success the subflow context is appended to the msk
 * join_list.
 *
 * Returns -ENOTCONN if @sk is not in TCP_ESTABLISHED state, the error from
 * socket creation, bind or connect on failure (the new socket is released
 * for bind/connect failures), otherwise the kernel_connect() result, which
 * may be -EINPROGRESS.
 */
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_subflow_context *subflow;
struct sockaddr_storage addr;
struct socket *sf;
u32 remote_token;
int addrlen;
int err;
if (sk->sk_state != TCP_ESTABLISHED)
return -ENOTCONN;
err = mptcp_subflow_create_socket(sk, &sf);
if (err)
return err;
/* the new subflow inherits keys and token from the MPTCP socket */
subflow = mptcp_subflow_ctx(sf->sk);
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
subflow->token = msk->token;
mptcp_info2sockaddr(loc, &addr);
/* NOTE(review): addrlen is derived from @loc only and reused for the
 * connect to @remote below - assumes both share the same family; confirm
 */
addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
sf->sk->sk_bound_dev_if = ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
/* token derived from the remote key, stored for this subflow */
mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
pr_debug("msk=%p remote_token=%u", msk, remote_token);
subflow->remote_token = remote_token;
subflow->local_id = loc->id;
subflow->request_join = 1;
/* NOTE(review): request_bkup is set unconditionally - confirm intended */
subflow->request_bkup = 1;
mptcp_info2sockaddr(remote, &addr);
/* non-blocking connect: -EINPROGRESS is not treated as a failure */
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
if (err && err != -EINPROGRESS)
goto failed;
spin_lock_bh(&msk->join_list_lock);
list_add_tail(&subflow->node, &msk->join_list);
spin_unlock_bh(&msk->join_list_lock);
return err;
failed:
sock_release(sf);
return err;
}
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{ {
struct mptcp_subflow_context *subflow; struct mptcp_subflow_context *subflow;
...@@ -785,7 +1067,8 @@ static void subflow_ulp_clone(const struct request_sock *req, ...@@ -785,7 +1067,8 @@ static void subflow_ulp_clone(const struct request_sock *req,
struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
struct mptcp_subflow_context *new_ctx; struct mptcp_subflow_context *new_ctx;
if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) { if (!tcp_rsk(req)->is_mptcp ||
(!subflow_req->mp_capable && !subflow_req->mp_join)) {
subflow_ulp_fallback(newsk, old_ctx); subflow_ulp_fallback(newsk, old_ctx);
return; return;
} }
...@@ -796,9 +1079,6 @@ static void subflow_ulp_clone(const struct request_sock *req, ...@@ -796,9 +1079,6 @@ static void subflow_ulp_clone(const struct request_sock *req,
return; return;
} }
/* see comments in subflow_syn_recv_sock(), MPTCP connection is fully
* established only after we receive the remote key
*/
new_ctx->conn_finished = 1; new_ctx->conn_finished = 1;
new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
...@@ -807,6 +1087,10 @@ static void subflow_ulp_clone(const struct request_sock *req, ...@@ -807,6 +1087,10 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->rel_write_seq = 1; new_ctx->rel_write_seq = 1;
new_ctx->tcp_sock = newsk; new_ctx->tcp_sock = newsk;
if (subflow_req->mp_capable) {
/* see comments in subflow_syn_recv_sock(), MPTCP connection
* is fully established only after we receive the remote key
*/
new_ctx->mp_capable = 1; new_ctx->mp_capable = 1;
new_ctx->fully_established = subflow_req->remote_key_valid; new_ctx->fully_established = subflow_req->remote_key_valid;
new_ctx->can_ack = subflow_req->remote_key_valid; new_ctx->can_ack = subflow_req->remote_key_valid;
...@@ -815,6 +1099,15 @@ static void subflow_ulp_clone(const struct request_sock *req, ...@@ -815,6 +1099,15 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->token = subflow_req->token; new_ctx->token = subflow_req->token;
new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->idsn = subflow_req->idsn; new_ctx->idsn = subflow_req->idsn;
} else if (subflow_req->mp_join) {
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->mp_join = 1;
new_ctx->fully_established = 1;
new_ctx->backup = subflow_req->backup;
new_ctx->local_id = subflow_req->local_id;
new_ctx->token = subflow_req->token;
new_ctx->thmac = subflow_req->thmac;
}
} }
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
...@@ -876,6 +1169,8 @@ void mptcp_subflow_init(void) ...@@ -876,6 +1169,8 @@ void mptcp_subflow_init(void)
subflow_v6m_specific.net_frag_header_len = 0; subflow_v6m_specific.net_frag_header_len = 0;
#endif #endif
mptcp_diag_subflow_init(&subflow_ulp_ops);
if (tcp_register_ulp(&subflow_ulp_ops) != 0) if (tcp_register_ulp(&subflow_ulp_ops) != 0)
panic("MPTCP: failed to register subflows to ULP\n"); panic("MPTCP: failed to register subflows to ULP\n");
} }
...@@ -140,6 +140,33 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) ...@@ -140,6 +140,33 @@ int mptcp_token_new_accept(u32 token, struct sock *conn)
return err; return err;
} }
/**
 * mptcp_token_get_sock - look up an mptcp connection by token
 * @token: token identifying the mptcp connection
 *
 * Searches the token tree under token_tree_lock. When a socket is found,
 * a reference is taken on it before the lock is dropped.
 *
 * returns the matching mptcp socket, or NULL when the token is unknown or
 * the tree slot only holds the token_used placeholder.
 */
struct mptcp_sock *mptcp_token_get_sock(u32 token)
{
	struct sock *sk;

	spin_lock_bh(&token_tree_lock);
	sk = radix_tree_lookup(&token_tree, token);
	if (sk == (struct sock *)&token_used)
		sk = NULL;	/* token still reserved: nothing to hand out */
	else if (sk)
		sock_hold(sk);
	spin_unlock_bh(&token_tree_lock);

	return mptcp_sk(sk);
}
/** /**
* mptcp_token_destroy_request - remove mptcp connection/token * mptcp_token_destroy_request - remove mptcp connection/token
* @token - token of mptcp connection to remove * @token - token of mptcp connection to remove
......
mptcp_connect mptcp_connect
pm_nl_ctl
*.pcap *.pcap
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
top_srcdir = ../../../../.. top_srcdir = ../../../../..
KSFT_KHDR_INSTALL := 1
CFLAGS = -Wall -Wl,--no-as-needed -O2 -g CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include
TEST_PROGS := mptcp_connect.sh TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh
TEST_GEN_FILES = mptcp_connect TEST_GEN_FILES = mptcp_connect pm_nl_ctl
TEST_FILES := settings TEST_FILES := settings
......
...@@ -51,6 +51,7 @@ static bool tcpulp_audit; ...@@ -51,6 +51,7 @@ static bool tcpulp_audit;
static int pf = AF_INET; static int pf = AF_INET;
static int cfg_sndbuf; static int cfg_sndbuf;
static int cfg_rcvbuf; static int cfg_rcvbuf;
static bool cfg_join;
static void die_usage(void) static void die_usage(void)
{ {
...@@ -250,6 +251,7 @@ static int sock_connect_mptcp(const char * const remoteaddr, ...@@ -250,6 +251,7 @@ static int sock_connect_mptcp(const char * const remoteaddr,
static size_t do_rnd_write(const int fd, char *buf, const size_t len) static size_t do_rnd_write(const int fd, char *buf, const size_t len)
{ {
static bool first = true;
unsigned int do_w; unsigned int do_w;
ssize_t bw; ssize_t bw;
...@@ -257,10 +259,19 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len) ...@@ -257,10 +259,19 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len)
if (do_w == 0 || do_w > len) if (do_w == 0 || do_w > len)
do_w = len; do_w = len;
if (cfg_join && first && do_w > 100)
do_w = 100;
bw = write(fd, buf, do_w); bw = write(fd, buf, do_w);
if (bw < 0) if (bw < 0)
perror("write"); perror("write");
/* let the join handshake complete, before going on */
if (cfg_join && first) {
usleep(200000);
first = false;
}
return bw; return bw;
} }
...@@ -385,8 +396,11 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) ...@@ -385,8 +396,11 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd)
break; break;
/* ... but we still receive. /* ... but we still receive.
* Close our write side. * Close our write side, ev. give some time
* for address notification
*/ */
if (cfg_join)
usleep(400000);
shutdown(peerfd, SHUT_WR); shutdown(peerfd, SHUT_WR);
} else { } else {
if (errno == EINTR) if (errno == EINTR)
...@@ -403,6 +417,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) ...@@ -403,6 +417,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd)
} }
} }
/* leave some time for late join/announce */
if (cfg_join)
usleep(400000);
close(peerfd); close(peerfd);
return 0; return 0;
} }
...@@ -658,7 +676,7 @@ static void maybe_close(int fd) ...@@ -658,7 +676,7 @@ static void maybe_close(int fd)
{ {
unsigned int r = rand(); unsigned int r = rand();
if (r & 1) if (!cfg_join && (r & 1))
close(fd); close(fd);
} }
...@@ -794,8 +812,12 @@ static void parse_opts(int argc, char **argv) ...@@ -794,8 +812,12 @@ static void parse_opts(int argc, char **argv)
{ {
int c; int c;
while ((c = getopt(argc, argv, "6lp:s:hut:m:S:R:")) != -1) { while ((c = getopt(argc, argv, "6jlp:s:hut:m:S:R:")) != -1) {
switch (c) { switch (c) {
case 'j':
cfg_join = true;
cfg_mode = CFG_MODE_POLL;
break;
case 'l': case 'l':
listen_mode = true; listen_mode = true;
break; break;
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# overall exit status of the script; set to non-zero by failing checks
ret=0
# temporary files: server input/output and client input/output
sin=""
sout=""
cin=""
cout=""
# exit code used when a prerequisite (ip tool, netns creation) is missing
ksft_skip=4
# value handed to mptcp_connect through its -t option
timeout=30
# set to 1 by the -c command line option: capture traffic with tcpdump
capture=0
# number of transfers started so far; also used to derive the port number
TEST_COUNT=0
# Create the capture log file and two network namespaces ($ns1, $ns2)
# connected by four veth pairs, each with an IPv4 and an IPv6 address.
# Exits with $ksft_skip if a namespace cannot be created.
init()
{
	capout=$(mktemp)

	# timestamp plus random suffix, used to build the namespace names
	rndh=$(printf %x "$(date +%s)")-$(mktemp -u XXXXXX)

	ns1="ns1-$rndh"
	ns2="ns2-$rndh"

	for netns in "$ns1" "$ns2";do
		ip netns add $netns || exit $ksft_skip
		ip -net $netns link set lo up
		ip netns exec $netns sysctl -q net.mptcp.enabled=1
		ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0
		ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0
	done

	#  ns1              ns2
	# ns1eth1    ns2eth1
	# ns1eth2    ns2eth2
	# ns1eth3    ns2eth3
	# ns1eth4    ns2eth4

	for i in `seq 1 4`; do
		ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2"
		ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i
		ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad
		ip -net "$ns1" link set ns1eth$i up

		ip -net "$ns2" addr add 10.0.$i.2/24 dev ns2eth$i
		ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad
		ip -net "$ns2" link set ns2eth$i up

		# let $ns2 reach any $ns1 address from any interface
		ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i
	done
}
# Drop the per-run state: the capture log and both network namespaces.
# The data files used for the transfers are left in place.
cleanup_partial()
{
	rm -f "$capout"

	for netns in "$ns1" "$ns2"
	do
		ip netns del $netns
	done
}
# Full teardown: remove the four data files, then everything that
# cleanup_partial handles.
cleanup()
{
	rm -f "$cin" "$cout" "$sin" "$sout"
	cleanup_partial
}
# Start the next test case from a clean environment: tear down the
# namespaces and capture log, then build them again.
reset()
{
	cleanup_partial && :
	init
}
# -c anywhere on the command line turns on packet capture
for arg in "$@"; do
	[ "$arg" = "-c" ] && capture=1
done

# the whole test depends on the ip tool: skip when it is not usable
if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi
# Compare a file that was sent with the copy that was received.
# $1: sent file, $2: received file, $3: label used in the failure message
# Returns 0 when the files are identical, 1 otherwise. On mismatch the
# listing (on stderr) and the last bytes of both files are printed.
check_transfer()
{
	in=$1
	out=$2
	what=$3

	if cmp "$in" "$out" > /dev/null 2>&1; then
		return 0
	fi

	echo "[ FAIL ] $what does not match (in, out):"
	# report both files inline; no external helper is needed
	local f
	for f in "$in" "$out"; do
		ls -l "$f" 1>&2
		echo "Trailing bytes are: "
		tail -c 27 "$f"
	done

	return 1
}
# Send a single ping from the connector namespace to an address.
# $1: listener namespace (only used in the failure message)
# $2: connector namespace, $3: address to ping
# Sets the global ret to 1 when the ping fails.
do_ping()
{
	listener_ns="$1"
	connector_ns="$2"
	connect_addr="$3"

	if ! ip netns exec ${connector_ns} ping -q -c 1 $connect_addr >/dev/null; then
		echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2
		ret=1
	fi
}
# Run one bidirectional transfer between a listening and a connecting
# mptcp_connect instance (both started with -j) and verify the payloads.
# $1: listener namespace, $2: connector namespace,
# $3: client protocol, $4: server protocol, $5: address to connect to
# Returns 0 when both processes exit cleanly and both files match,
# 1 otherwise.
do_transfer()
{
	listener_ns="$1"
	connector_ns="$2"
	cl_proto="$3"
	srv_proto="$4"
	connect_addr="$5"

	# a different port for every transfer
	port=$((10000+$TEST_COUNT))
	TEST_COUNT=$((TEST_COUNT+1))

	# truncate the output files left over from the previous run
	:> "$cout"
	:> "$sout"
	:> "$capout"

	if [ $capture -eq 1 ]; then
		if [ -z $SUDO_USER ] ; then
			capuser=""
		else
			capuser="-Z $SUDO_USER"
		fi

		capfile="mp_join-${listener_ns}.pcap"

		echo "Capturing traffic for test $TEST_COUNT into $capfile"
		ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
		cappid=$!

		sleep 1
	fi

	ip netns exec ${listener_ns} ./mptcp_connect -j -t $timeout -l -p $port -s ${srv_proto} 0.0.0.0 < "$sin" > "$sout" &
	spid=$!

	# give the listener time to start before connecting
	sleep 1

	ip netns exec ${connector_ns} ./mptcp_connect -j -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" &
	cpid=$!

	wait $cpid
	retc=$?
	wait $spid
	rets=$?

	if [ $capture -eq 1 ]; then
		sleep 1
		kill $cappid
	fi

	if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
		echo " client exit code $retc, server $rets" 1>&2
		# -e is required for bash's echo to turn \n into a newline
		echo -e "\nnetns ${listener_ns} socket stat for $port:" 1>&2
		ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port"
		echo -e "\nnetns ${connector_ns} socket stat for $port:" 1>&2
		ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port"

		cat "$capout"
		return 1
	fi

	check_transfer $sin $cout "file received by client"
	retc=$?
	check_transfer $cin $sout "file received by server"
	rets=$?

	if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
		cat "$capout"
		return 0
	fi

	cat "$capout"
	return 1
}
# Fill a file with random data followed by an end marker line.
# $1: path of the file to (over)write
# $2: name of the sender, only used in the message printed on stdout
make_file()
{
	name=$1
	who=$2
	SIZE=1

	dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
	printf '\n%s\n' "MPTCP_TEST_FILE_END_MARKER" >> "$name"

	echo "Created $name (size $SIZE KB) containing data sent by $who"
}
# Run a single MPTCP-to-MPTCP transfer and record a failure in the
# global ret.
# $1: listener namespace, $2: connector namespace, $3: address to connect to
run_tests()
{
	listener_ns="$1"
	connector_ns="$2"
	connect_addr="$3"

	do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr}
	lret=$?
	[ $lret -eq 0 ] || ret=$lret
}
# Compare the MP_JOIN counters reported by nstat with the expected values.
# $1: test description
# $2: expected MPTcpExtMPJoinSynRx in $ns1
# $3: expected MPTcpExtMPJoinSynAckRx in $ns2
# $4: expected MPTcpExtMPJoinAckRx in $ns1
# Any mismatch sets the global ret to 1 and triggers a dump of all MPTcp
# counters of both namespaces.
chk_join_nr()
{
	local msg="$1"
	local syn_nr=$2
	local syn_ack_nr=$3
	local ack_nr=$4
	local count
	local dump_stats

	printf "%-36s %s" "$msg" "syn"
	count=$(ip netns exec $ns1 nstat -as | awk '/MPTcpExtMPJoinSynRx/ {print $2}')
	# a counter missing from the output is treated as zero
	count=${count:-0}
	if [ "$count" = "$syn_nr" ]; then
		echo -n "[ ok ]"
	else
		echo "[fail] got $count JOIN[s] syn expected $syn_nr"
		ret=1
		dump_stats=1
	fi

	echo -n " - synack"
	count=$(ip netns exec $ns2 nstat -as | awk '/MPTcpExtMPJoinSynAckRx/ {print $2}')
	count=${count:-0}
	if [ "$count" = "$syn_ack_nr" ]; then
		echo -n "[ ok ]"
	else
		echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr"
		ret=1
		dump_stats=1
	fi

	echo -n " - ack"
	count=$(ip netns exec $ns1 nstat -as | awk '/MPTcpExtMPJoinAckRx/ {print $2}')
	count=${count:-0}
	if [ "$count" = "$ack_nr" ]; then
		echo "[ ok ]"
	else
		echo "[fail] got $count JOIN[s] ack expected $ack_nr"
		ret=1
		dump_stats=1
	fi

	if [ "${dump_stats}" = 1 ]; then
		echo Server ns stats
		ip netns exec $ns1 nstat -as | grep MPTcp
		echo Client ns stats
		ip netns exec $ns2 nstat -as | grep MPTcp
	fi
}
# data files: what each side sends (…in) and what it receives (…out)
sin=$(mktemp)
sout=$(mktemp)
cin=$(mktemp)
cout=$(mktemp)
init
make_file "$cin" "client"
make_file "$sin" "server"
# remove the namespaces and temporary files whenever the script exits
trap cleanup EXIT
# In every test below $ns1 is the listener (server) and $ns2 the connector
# (client); chk_join_nr arguments are the expected syn, synack and ack counts.
# no endpoint configured: plain connection without any JOIN
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "no JOIN" "0" "0" "0"
# subflow limited by client: endpoint added but no limit raised
reset
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "single subflow, limited by client" 0 0 0
# subflow limited by server: only the client limit is raised
reset
ip netns exec $ns2 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "single subflow, limited by server" 1 1 0
# subflow: limit raised on both sides
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "single subflow" 1 1 1
# multiple subflows
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "multiple subflows" 2 2 2
# multiple subflows limited by server
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "multiple subflows, limited by server" 2 2 1
# add_address, unused: the server signals an address, no limit is raised
reset
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "unused signal address" 0 0 0
# accept and use add_addr
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 1 1
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "signal address" 1 1 1
# accept and use add_addr with an additional subflow
# note: signal address in server ns and local addresses in client ns must
# belong to different subnets or one of the listed local address could be
# used for 'add_addr' subflow
reset
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl limits 1 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "subflow and signal" 2 2 2
# accept and use add_addr with additional subflows
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 3
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 3
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "multiple subflows and signal" 3 3 3
# non-zero when any transfer or counter check failed
exit $ret
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
ksft_skip=4
ret=0
# Print the one-line command synopsis on stdout.
usage()
{
	printf '%s\n' "Usage: $0 [ -h ]"
}
# options understood by this script; -h is the only one
optstring="h"

while getopts "$optstring" option;do
	case "$option" in
	"h")
		usage $0
		exit 0
		;;
	"?")
		# getopts reports unknown options as '?'
		usage $0
		exit 1
		;;
	esac
done
# namespace name: timestamp in hex plus a random suffix
sec=$(date +%s)
rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
ns1="ns1-$rndh"
# temporary file receiving the stderr of the commands run by check()
err=$(mktemp)
# overall exit status; set to 1 by check() on any failure
ret=0
# Exit handler: remove the stderr scratch file and the test namespace.
cleanup()
{
	# $err is the only temporary file this script creates
	rm -f "$err"
	ip netns del $ns1
}
# the test cannot run without the ip tool
if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

trap cleanup EXIT

# a single namespace with loopback up and MPTCP enabled is all that is needed
ip netns add $ns1 || exit $ksft_skip
ip -net $ns1 link set lo up
ip netns exec $ns1 sysctl -q net.mptcp.enabled=1
# Run a command and compare its standard output with the expected text.
# $1: command line (word-split and executed as is)
# $2: expected standard output
# $3: test description printed at the start of the result line
# Sets the global ret to 1 when the check fails.
check()
{
local cmd="$1"
local expected="$2"
local msg="$3"
# stderr is kept in the $err file so it can be shown on failure
local out=`$cmd 2>$err`
# NOTE(review): in bash, $? after 'local var=`cmd`' is the status of the
# 'local' builtin, not of $cmd, so cmd_ret is always 0 here and the first
# branch below is never taken. Several callers expect empty output from
# lookups of ids that should not exist; confirm how pm_nl_ctl exits in those
# cases before changing this.
local cmd_ret=$?
# only one argument is supplied: the trailing %s expands to nothing
printf "%-50s %s" "$msg"
if [ $cmd_ret -ne 0 ]; then
echo "[FAIL] command execution '$cmd' stderr "
cat $err
ret=1
elif [ "$out" = "$expected" ]; then
echo "[ OK ]"
else
echo -n "[FAIL] "
echo "expected '$expected' got '$out'"
ret=1
fi
}
# a new namespace is expected to start with no addresses and zero limits
check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list"
check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
subflows 0" "defaults limits"
# add three addresses: no flags, subflow bound to lo, signal+backup
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup
check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1 " "simple add/get addr"
check "ip netns exec $ns1 ./pm_nl_ctl dump" \
"id 1 flags 10.0.1.1
id 2 flags subflow dev lo 10.0.1.2
id 3 flags signal,backup 10.0.1.3 " "dump addrs"
# deleting id 2 must leave ids 1 and 3 untouched
ip netns exec $ns1 ./pm_nl_ctl del 2
check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr"
check "ip netns exec $ns1 ./pm_nl_ctl dump" \
"id 1 flags 10.0.1.1
id 3 flags signal,backup 10.0.1.3 " "dump addrs after del"
# adding an address that is already present must not create id 4
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3
check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr"
# id 10 is requested, but the check below expects the entry under id 4
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 id 10 flags signal
check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4 " "id addr increment"
# keep adding addresses; errors are silenced, ids 9 and 10 are checked next
for i in `seq 5 9`; do
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1
done
check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9 " "hard addr limit"
check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit"
# delete and re-add an address for every id from 9 to 256; the dump below
# expects none of them to remain
for i in `seq 9 256`; do
ip netns exec $ns1 ./pm_nl_ctl del $i
ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9
done
check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1
id 3 flags signal,backup 10.0.1.3
id 4 flags signal 10.0.1.4
id 5 flags signal 10.0.1.5
id 6 flags signal 10.0.1.6
id 7 flags signal 10.0.1.7
id 8 flags signal 10.0.1.8 " "id limit"
ip netns exec $ns1 ./pm_nl_ctl flush
check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs"
# a value of 9 for either limit is expected to leave both limits unchanged
ip netns exec $ns1 ./pm_nl_ctl limits 9 1
check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
subflows 0" "rcv addrs above hard limit"
ip netns exec $ns1 ./pm_nl_ctl limits 1 9
check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
subflows 0" "subflows above hard limit"
# 8 is expected to be accepted for both limits
ip netns exec $ns1 ./pm_nl_ctl limits 8 8
check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8
subflows 8" "set limits"
exit $ret
// SPDX-License-Identifier: GPL-2.0
#include <errno.h>
#include <error.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/rtnetlink.h>
#include <linux/genetlink.h>
#include "linux/mptcp.h"
#ifndef MPTCP_PM_NAME
#define MPTCP_PM_NAME "mptcp_pm"
#endif
/*
 * Print the command line synopsis on stderr and terminate the program.
 *
 * The sub-command list and the argument order mirror what main() and
 * add_addr() actually parse: the sub-command for limits is "limits", and
 * "add" takes the address first, followed by the optional keywords.
 */
static void syntax(char *argv[])
{
	fprintf(stderr, "%s add|get|del|flush|dump|limits [<args>]\n", argv[0]);
	fprintf(stderr, "\tadd <ip> [flags signal|subflow|backup] [id <nr>] [dev <name>]\n");
	fprintf(stderr, "\tdel <id>\n");
	fprintf(stderr, "\tget <id>\n");
	fprintf(stderr, "\tflush\n");
	fprintf(stderr, "\tdump\n");
	fprintf(stderr, "\tlimits [<rcv addr max> <subflow max>]\n");
	exit(0);
}
/*
 * Fill in the netlink and generic netlink headers of a request.
 *
 * @data:    buffer receiving the headers
 * @family:  value stored in nlmsg_type
 * @cmd:     generic netlink command
 * @version: generic netlink version
 *
 * nlmsg_len is set to cover just the two headers. Returns the aligned
 * offset in @data right after the generic netlink header.
 */
static int init_genl_req(char *data, int family, int cmd, int version)
{
	const int nl_hdr_len = NLMSG_ALIGN(sizeof(struct nlmsghdr));
	struct genlmsghdr *genl = (void *)(data + nl_hdr_len);
	struct nlmsghdr *nlh = (void *)data;

	nlh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	nlh->nlmsg_flags = NLM_F_REQUEST;
	nlh->nlmsg_type = family;

	genl->version = version;
	genl->cmd = cmd;

	return nl_hdr_len + NLMSG_ALIGN(sizeof(*genl));
}
/*
 * Report the content of an NLMSG_ERROR message on stderr.
 *
 * Exits when the message is too short to hold a struct nlmsgerr. When the
 * error code is zero the payload is scanned for NLMSGERR_ATTR_MSG and
 * NLMSGERR_ATTR_OFFS attributes; otherwise the error code is printed.
 */
static void nl_error(struct nlmsghdr *nh)
{
	struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(nh);
	int len = nh->nlmsg_len - sizeof(*nh);
	uint32_t off;

	/* compare as signed so that an undersized nlmsg_len is caught too */
	if (len < (int)sizeof(struct nlmsgerr))
		error(1, 0, "netlink error message truncated %d min %zu", len,
		      sizeof(struct nlmsgerr));

	if (!err->error) {
		/* check messages from kernel */
		/* NOTE(review): the scan starts at NLMSG_DATA(nh), the same
		 * address as the nlmsgerr itself, not after it - confirm
		 * this is the intended start of the attributes.
		 */
		struct rtattr *attrs = (struct rtattr *)NLMSG_DATA(nh);

		while (RTA_OK(attrs, len)) {
			if (attrs->rta_type == NLMSGERR_ATTR_MSG)
				fprintf(stderr, "netlink ext ack msg: %s\n",
					(char *)RTA_DATA(attrs));
			if (attrs->rta_type == NLMSGERR_ATTR_OFFS) {
				memcpy(&off, RTA_DATA(attrs), 4);
				fprintf(stderr, "netlink err off %d\n",
					(int)off);
			}
			attrs = RTA_NEXT(attrs, len);
		}
	} else {
		fprintf(stderr, "netlink error %d\n", err->error);
	}
}
/*
 * Send a netlink request and, if max > 0, fetch the reply.
 *
 * @fd:  netlink socket
 * @nh:  request buffer; the reply, if any, overwrites it
 * @len: length of the request, also stored in nlmsg_len
 * @max: room available in the buffer for the reply, 0 to skip receiving
 *
 * Returns 0 when no reply is requested, otherwise the number of bytes
 * received. Exits on send/receive failure or when the reply carries an
 * NLMSG_ERROR message.
 */
static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max)
{
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	socklen_t addr_len;
	void *data = nh;
	int rem, ret;
	int err = 0;

	nh->nlmsg_len = len;
	ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr));
	/* ret and len are signed: ret is -1 on failure */
	if (ret != len)
		error(1, errno, "send netlink: %dB != %dB", ret, len);
	if (max == 0)
		return 0;

	addr_len = sizeof(nladdr);
	rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len);
	if (ret < 0)
		error(1, errno, "recv netlink: %dB", ret);

	/* Beware: the NLMSG_NEXT macro updates the 'rem' argument */
	for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) {
		if (nh->nlmsg_type == NLMSG_ERROR) {
			nl_error(nh);
			err = 1;
		}
	}
	if (err)
		error(1, 0, "bailing out due to netlink error[s]");
	return ret;
}
/*
 * Extract the family id from the reply to a CTRL_CMD_GETFAMILY request.
 *
 * Exits when the message is not a CTRL_CMD_NEWFAMILY controller message
 * or when it carries no CTRL_ATTR_FAMILY_ID attribute.
 */
static int genl_parse_getfamily(struct nlmsghdr *nlh)
{
	struct genlmsghdr *genl = NLMSG_DATA(nlh);
	struct rtattr *attr;
	int remaining;

	if (nlh->nlmsg_type != GENL_ID_CTRL)
		error(1, errno, "Not a controller message, len=%d type=0x%x\n",
		      nlh->nlmsg_len, nlh->nlmsg_type);

	/* what is left once both headers are accounted for */
	remaining = nlh->nlmsg_len;
	remaining -= NLMSG_LENGTH(GENL_HDRLEN);
	if (remaining < 0)
		error(1, errno, "wrong controller message len %d\n", remaining);

	if (genl->cmd != CTRL_CMD_NEWFAMILY)
		error(1, errno, "Unknown controller command %d\n", genl->cmd);

	/* RTA_NEXT also shrinks 'remaining' */
	for (attr = (struct rtattr *)((char *)genl + GENL_HDRLEN);
	     RTA_OK(attr, remaining);
	     attr = RTA_NEXT(attr, remaining)) {
		if (attr->rta_type == CTRL_ATTR_FAMILY_ID)
			return *(__u16 *)RTA_DATA(attr);
	}

	error(1, errno, "can't find CTRL_ATTR_FAMILY_ID attr");
	return -1;
}
/*
 * Ask the generic netlink controller for the id of the MPTCP_PM_NAME
 * family and return it. Failures terminate the program inside the helpers.
 */
static int resolve_mptcp_pm_netlink(int fd)
{
	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
		  1024];
	/* the family name is sent with its terminating NUL */
	const int name_len = strlen(MPTCP_PM_NAME) + 1;
	struct rtattr *name_attr;
	int off;

	memset(data, 0, sizeof(data));
	off = init_genl_req(data, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, 0);

	name_attr = (void *)(data + off);
	name_attr->rta_len = RTA_LENGTH(name_len);
	name_attr->rta_type = CTRL_ATTR_FAMILY_NAME;
	memcpy(RTA_DATA(name_attr), MPTCP_PM_NAME, name_len);
	off += NLMSG_ALIGN(name_attr->rta_len);

	/* the reply is written back into the request buffer */
	do_nl_req(fd, (void *)data, off, sizeof(data));
	return genl_parse_getfamily((void *)data);
}
/*
 * Build and send an MPTCP_PM_CMD_ADD_ADDR request.
 *
 * Command line layout: argv[2] is the IPv4 or IPv6 address, followed by any
 * of the optional keywords:
 *   flags <flag>[,<flag>...]   with flag in signal, subflow, backup
 *   id <nr>
 *   dev <name>
 *
 * All attributes are nested inside one MPTCP_PM_ATTR_ADDR attribute. No
 * reply is read. Invalid input terminates the program; returns 0 otherwise.
 */
int add_addr(int fd, int pm_family, int argc, char *argv[])
{
	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
		  1024];
	struct rtattr *rta, *nest;
	struct nlmsghdr *nh;
	u_int16_t family;
	u_int32_t flags;
	int nest_start;
	u_int8_t id;
	int off = 0;
	int arg;

	memset(data, 0, sizeof(data));
	nh = (void *)data;
	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_ADD_ADDR,
			    MPTCP_PM_VER);

	if (argc < 3)
		syntax(argv);

	/* open the nest; its final length is patched in at the end */
	nest_start = off;
	nest = (void *)(data + off);
	nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
	nest->rta_len = RTA_LENGTH(0);
	off += NLMSG_ALIGN(nest->rta_len);

	/* addr data: inet_pton writes straight into the attribute payload */
	rta = (void *)(data + off);
	if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) {
		family = AF_INET;
		rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4;
		rta->rta_len = RTA_LENGTH(4);
	} else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) {
		family = AF_INET6;
		rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6;
		rta->rta_len = RTA_LENGTH(16);
	} else
		error(1, errno, "can't parse ip %s", argv[2]);
	off += NLMSG_ALIGN(rta->rta_len);

	/* family */
	rta = (void *)(data + off);
	rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY;
	rta->rta_len = RTA_LENGTH(2);
	memcpy(RTA_DATA(rta), &family, 2);
	off += NLMSG_ALIGN(rta->rta_len);

	for (arg = 3; arg < argc; arg++) {
		if (!strcmp(argv[arg], "flags")) {
			char *tok, *str;

			/* flags */
			flags = 0;
			if (++arg >= argc)
				error(1, 0, " missing flags value");

			/* comma separated list of flag names */
			for (str = argv[arg]; (tok = strtok(str, ","));
			     str = NULL) {
				if (!strcmp(tok, "subflow"))
					flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;
				else if (!strcmp(tok, "signal"))
					flags |= MPTCP_PM_ADDR_FLAG_SIGNAL;
				else if (!strcmp(tok, "backup"))
					flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
				else
					/* strtok has cut argv[arg] at the
					 * first comma: report the token
					 */
					error(1, 0,
					      "unknown flag %s", tok);
			}

			rta = (void *)(data + off);
			rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS;
			rta->rta_len = RTA_LENGTH(4);
			memcpy(RTA_DATA(rta), &flags, 4);
			off += NLMSG_ALIGN(rta->rta_len);
		} else if (!strcmp(argv[arg], "id")) {
			if (++arg >= argc)
				error(1, 0, " missing id value");

			id = atoi(argv[arg]);
			rta = (void *)(data + off);
			rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
			rta->rta_len = RTA_LENGTH(1);
			memcpy(RTA_DATA(rta), &id, 1);
			off += NLMSG_ALIGN(rta->rta_len);
		} else if (!strcmp(argv[arg], "dev")) {
			int32_t ifindex;

			if (++arg >= argc)
				error(1, 0, " missing dev name");

			ifindex = if_nametoindex(argv[arg]);
			if (!ifindex)
				error(1, errno, "unknown device %s", argv[arg]);

			rta = (void *)(data + off);
			rta->rta_type = MPTCP_PM_ADDR_ATTR_IF_IDX;
			rta->rta_len = RTA_LENGTH(4);
			memcpy(RTA_DATA(rta), &ifindex, 4);
			off += NLMSG_ALIGN(rta->rta_len);
		} else
			error(1, 0, "unknown keyword %s", argv[arg]);
	}
	/* close the nest: it spans everything appended since nest_start */
	nest->rta_len = off - nest_start;

	do_nl_req(fd, nh, off, 0);
	return 0;
}
/*
 * Send an MPTCP_PM_CMD_DEL_ADDR request for the address id in argv[2].
 * The request carries a nested address attribute holding only the id;
 * no reply is read. Returns 0.
 */
int del_addr(int fd, int pm_family, int argc, char *argv[])
{
	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
		  1024];
	struct nlmsghdr *nh = (void *)data;
	struct rtattr *addr_nest, *id_attr;
	u_int8_t addr_id;
	int nest_off;
	int off;

	memset(data, 0, sizeof(data));
	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR,
			    MPTCP_PM_VER);

	/* the only argument is the address id */
	if (argc != 3)
		syntax(argv);
	addr_id = atoi(argv[2]);

	/* nested container, length fixed up below */
	nest_off = off;
	addr_nest = (void *)(data + nest_off);
	addr_nest->rta_len = RTA_LENGTH(0);
	addr_nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
	off += NLMSG_ALIGN(addr_nest->rta_len);

	/* build a dummy addr with only the ID set */
	id_attr = (void *)(data + off);
	id_attr->rta_len = RTA_LENGTH(1);
	id_attr->rta_type = MPTCP_PM_ADDR_ATTR_ID;
	memcpy(RTA_DATA(id_attr), &addr_id, 1);
	off += NLMSG_ALIGN(id_attr->rta_len);

	addr_nest->rta_len = off - nest_off;

	do_nl_req(fd, nh, off, 0);
	return 0;
}
/*
 * Print one address entry on a single line of stdout.
 *
 * @attrs: first attribute of the entry
 * @len:   number of bytes to scan
 *
 * Each recognised attribute is printed in the order it is found. An
 * address attribute whose type disagrees with the family attribute seen
 * so far terminates the program.
 */
static void print_addr(struct rtattr *attrs, int len)
{
	static const struct {
		uint32_t bit;
		const char *label;
	} known_flags[] = {
		{ MPTCP_PM_ADDR_FLAG_SIGNAL, "signal" },
		{ MPTCP_PM_ADDR_FLAG_SUBFLOW, "subflow" },
		{ MPTCP_PM_ADDR_FLAG_BACKUP, "backup" },
	};
	uint16_t family = 0;
	char str[1024];
	uint32_t flags;
	uint8_t id;
	size_t i;

	for (; RTA_OK(attrs, len); attrs = RTA_NEXT(attrs, len)) {
		switch (attrs->rta_type) {
		case MPTCP_PM_ADDR_ATTR_FAMILY:
			memcpy(&family, RTA_DATA(attrs), 2);
			break;

		case MPTCP_PM_ADDR_ATTR_ADDR4:
			if (family != AF_INET)
				error(1, errno, "wrong IP (v4) for family %d",
				      family);
			inet_ntop(AF_INET, RTA_DATA(attrs), str, sizeof(str));
			printf("%s ", str);
			break;

		case MPTCP_PM_ADDR_ATTR_ADDR6:
			if (family != AF_INET6)
				error(1, errno, "wrong IP (v6) for family %d",
				      family);
			inet_ntop(AF_INET6, RTA_DATA(attrs), str, sizeof(str));
			printf("%s ", str);
			break;

		case MPTCP_PM_ADDR_ATTR_ID:
			memcpy(&id, RTA_DATA(attrs), 1);
			printf("id %d ", id);
			break;

		case MPTCP_PM_ADDR_ATTR_FLAGS:
			memcpy(&flags, RTA_DATA(attrs), 4);

			printf("flags ");
			/* a comma follows a name only if more bits remain */
			for (i = 0; i < sizeof(known_flags) /
					sizeof(known_flags[0]); i++) {
				if (!(flags & known_flags[i].bit))
					continue;
				printf("%s", known_flags[i].label);
				flags &= ~known_flags[i].bit;
				if (flags)
					printf(",");
			}

			/* bump unknown flags, if any */
			if (flags)
				printf("0x%x", flags);
			printf(" ");
			break;

		case MPTCP_PM_ADDR_ATTR_IF_IDX: {
			char name[IF_NAMESIZE], *ret;
			int32_t ifindex;

			memcpy(&ifindex, RTA_DATA(attrs), 4);
			ret = if_indextoname(ifindex, name);
			if (ret)
				printf("dev %s ", ret);
			else
				printf("dev unknown/%d", ifindex);
			break;
		}

		default:
			break;
		}
	}
	printf("\n");
}
/*
 * Print every address entry found in a netlink reply.
 *
 * @nh:        first message of the reply
 * @pm_family: nlmsg_type of the messages to print
 * @total_len: number of valid bytes starting at @nh
 *
 * Stops at NLMSG_DONE, reports NLMSG_ERROR messages through nl_error() and
 * ignores messages of any other type.
 */
static void print_addrs(struct nlmsghdr *nh, int pm_family, int total_len)
{
	struct rtattr *attrs;

	for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
		int len = nh->nlmsg_len;

		if (nh->nlmsg_type == NLMSG_DONE)
			break;
		if (nh->nlmsg_type == NLMSG_ERROR)
			nl_error(nh);
		if (nh->nlmsg_type != pm_family)
			continue;

		len -= NLMSG_LENGTH(GENL_HDRLEN);
		attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) +
					   GENL_HDRLEN);
		while (RTA_OK(attrs, len)) {
			/* rta_len includes the attribute header, so only
			 * the payload length bounds the nested attributes
			 */
			if (attrs->rta_type ==
			    (MPTCP_PM_ATTR_ADDR | NLA_F_NESTED))
				print_addr((void *)RTA_DATA(attrs),
					   RTA_PAYLOAD(attrs));
			attrs = RTA_NEXT(attrs, len);
		}
	}
}
/*
 * Send an MPTCP_PM_CMD_GET_ADDR request for the address id in argv[2] and
 * print whatever entries the reply contains. Returns 0.
 */
int get_addr(int fd, int pm_family, int argc, char *argv[])
{
	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
		  1024];
	struct nlmsghdr *nh = (void *)data;
	struct rtattr *addr_nest, *id_attr;
	u_int8_t addr_id;
	int reply_len;
	int nest_off;
	int off;

	memset(data, 0, sizeof(data));
	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
			    MPTCP_PM_VER);

	/* the only argument is the address id */
	if (argc != 3)
		syntax(argv);
	addr_id = atoi(argv[2]);

	/* nested container, length fixed up below */
	nest_off = off;
	addr_nest = (void *)(data + nest_off);
	addr_nest->rta_len = RTA_LENGTH(0);
	addr_nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
	off += NLMSG_ALIGN(addr_nest->rta_len);

	/* build a dummy addr with only the ID set */
	id_attr = (void *)(data + off);
	id_attr->rta_len = RTA_LENGTH(1);
	id_attr->rta_type = MPTCP_PM_ADDR_ATTR_ID;
	memcpy(RTA_DATA(id_attr), &addr_id, 1);
	off += NLMSG_ALIGN(id_attr->rta_len);

	addr_nest->rta_len = off - nest_off;

	/* the reply lands in the same buffer as the request */
	reply_len = do_nl_req(fd, nh, off, sizeof(data));
	print_addrs(nh, pm_family, reply_len);
	return 0;
}
/*
 * Send an MPTCP_PM_CMD_GET_ADDR request flagged as a dump and print the
 * entries found in the single reply that is read. Returns 0.
 */
int dump_addrs(int fd, int pm_family, int argc, char *argv[])
{
	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
		  1024];
	struct nlmsghdr *nh = (void *)data;
	pid_t self = getpid();
	int reply_len;
	int off;

	memset(data, 0, sizeof(data));
	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
			    MPTCP_PM_VER);

	/* turn the plain get into a dump request */
	nh->nlmsg_flags |= NLM_F_DUMP;
	nh->nlmsg_len = off;
	nh->nlmsg_pid = self;
	nh->nlmsg_seq = 1;

	reply_len = do_nl_req(fd, nh, off, sizeof(data));
	print_addrs(nh, pm_family, reply_len);
	return 0;
}
/*
 * Send an MPTCP_PM_CMD_FLUSH_ADDRS request, made of the headers only.
 * No reply is read. Returns 0.
 */
int flush_addrs(int fd, int pm_family, int argc, char *argv[])
{
	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
		  1024];
	int hdr_len;

	memset(data, 0, sizeof(data));
	hdr_len = init_genl_req(data, pm_family, MPTCP_PM_CMD_FLUSH_ADDRS,
				MPTCP_PM_VER);

	do_nl_req(fd, (void *)data, hdr_len, 0);
	return 0;
}
/*
 * Print the limits found in a netlink reply, one "<name> <value>" line per
 * attribute: "accept" for MPTCP_PM_ATTR_RCV_ADD_ADDRS and "subflows" for
 * MPTCP_PM_ATTR_SUBFLOWS.
 *
 * @nh:        first message of the reply
 * @pm_family: nlmsg_type of the messages to print
 * @total_len: number of valid bytes starting at @nh
 */
static void print_limits(struct nlmsghdr *nh, int pm_family, int total_len)
{
	struct rtattr *attr;
	uint32_t value;

	for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
		int remaining = nh->nlmsg_len;

		if (nh->nlmsg_type == NLMSG_DONE)
			break;
		if (nh->nlmsg_type == NLMSG_ERROR)
			nl_error(nh);
		if (nh->nlmsg_type != pm_family)
			continue;

		remaining -= NLMSG_LENGTH(GENL_HDRLEN);
		for (attr = (struct rtattr *)((char *)NLMSG_DATA(nh) +
					      GENL_HDRLEN);
		     RTA_OK(attr, remaining);
		     attr = RTA_NEXT(attr, remaining)) {
			int type = attr->rta_type;

			/* only the two limit attributes are of interest */
			if (type == MPTCP_PM_ATTR_RCV_ADD_ADDRS ||
			    type == MPTCP_PM_ATTR_SUBFLOWS) {
				memcpy(&value, RTA_DATA(attr), 4);
				printf("%s %u\n",
				       type == MPTCP_PM_ATTR_SUBFLOWS ?
				       "subflows" : "accept", value);
			}
		}
	}
}
/*
 * Read or change the limits.
 *
 * With exactly two values on the command line (argc == 4) an
 * MPTCP_PM_CMD_SET_LIMITS request is sent with argv[2] as
 * MPTCP_PM_ATTR_RCV_ADD_ADDRS and argv[3] as MPTCP_PM_ATTR_SUBFLOWS, and no
 * reply is read. With any other argument count an MPTCP_PM_CMD_GET_LIMITS
 * request is sent and the reply printed. Returns 0.
 */
int get_set_limits(int fd, int pm_family, int argc, char *argv[])
{
	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
		  1024];
	const int setting = (argc == 4);
	const int cmd = setting ? MPTCP_PM_CMD_SET_LIMITS :
				  MPTCP_PM_CMD_GET_LIMITS;
	uint32_t rcv_addr = 0, subflows = 0;
	struct nlmsghdr *nh = (void *)data;
	struct rtattr *attr;
	int reply_len;
	int off;

	if (setting) {
		rcv_addr = atoi(argv[2]);
		subflows = atoi(argv[3]);
	}

	memset(data, 0, sizeof(data));
	off = init_genl_req(data, pm_family, cmd, MPTCP_PM_VER);

	if (!setting) {
		/* query: the reply comes back in the request buffer */
		reply_len = do_nl_req(fd, nh, off, sizeof(data));
		print_limits(nh, pm_family, reply_len);
		return 0;
	}

	attr = (void *)(data + off);
	attr->rta_len = RTA_LENGTH(4);
	attr->rta_type = MPTCP_PM_ATTR_RCV_ADD_ADDRS;
	memcpy(RTA_DATA(attr), &rcv_addr, 4);
	off += NLMSG_ALIGN(attr->rta_len);

	attr = (void *)(data + off);
	attr->rta_len = RTA_LENGTH(4);
	attr->rta_type = MPTCP_PM_ATTR_SUBFLOWS;
	memcpy(RTA_DATA(attr), &subflows, 4);
	off += NLMSG_ALIGN(attr->rta_len);

	/* do not expect a reply */
	do_nl_req(fd, nh, off, 0);
	return 0;
}
/*
 * Entry point: open a generic netlink socket, resolve the MPTCP path
 * manager family and dispatch on the sub-command in argv[1].
 *
 * A missing or unknown sub-command prints the usage via syntax(), which
 * terminates the program.
 */
int main(int argc, char *argv[])
{
	int fd, pm_family;

	if (argc < 2)
		syntax(argv);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (fd == -1)
		error(1, errno, "socket netlink");

	pm_family = resolve_mptcp_pm_netlink(fd);

	if (!strcmp(argv[1], "add"))
		return add_addr(fd, pm_family, argc, argv);
	else if (!strcmp(argv[1], "del"))
		return del_addr(fd, pm_family, argc, argv);
	else if (!strcmp(argv[1], "flush"))
		return flush_addrs(fd, pm_family, argc, argv);
	else if (!strcmp(argv[1], "get"))
		return get_addr(fd, pm_family, argc, argv);
	else if (!strcmp(argv[1], "dump"))
		return dump_addrs(fd, pm_family, argc, argv);
	else if (!strcmp(argv[1], "limits"))
		return get_set_limits(fd, pm_family, argc, argv);

	/* end the line so the usage text starts on a line of its own */
	fprintf(stderr, "unknown sub-command: %s\n", argv[1]);
	syntax(argv);
	return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment