Commit ebf32282 authored by Jakub Kicinski

Merge branch 'mptcp-another-set-of-miscellaneous-mptcp-fixes'

Mat Martineau says:

====================
mptcp: Another set of miscellaneous MPTCP fixes

This is another collection of MPTCP fixes and enhancements that we have
tested in the MPTCP tree:

Patch 1 cleans up cgroup attachment for in-kernel subflow sockets.

Patches 2 and 3 make sure that deletion of advertised addresses by an
MPTCP path manager when flushing all addresses behaves similarly to the
remove-single-address operation, and add related tests.

Patches 4 and 8 do some minor cleanup.

Patches 5-7 add MPTCP_FASTCLOSE functionality. Note that patch 6 adds MPTCP
option parsing to tcp_reset().

Patch 9 optimizes skb size for outgoing MPTCP packets.
====================

Link: https://lore.kernel.org/r/20201210222506.222251-1-mathew.j.martineau@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents efc36d3c 15e6ca97
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -611,7 +611,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 /* tcp_input.c */
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
-void tcp_reset(struct sock *sk);
+void tcp_reset(struct sock *sk, struct sk_buff *skb);
 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 void tcp_fin(struct sock *sk);

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4218,10 +4218,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk, struct sk_buff *skb)
 {
     trace_tcp_receive_reset(sk);
 
+    if (sk_is_mptcp(sk))
+        mptcp_incoming_options(sk, skb);
+
     /* We want the right error as BSD sees it (and indeed as we do). */
     switch (sk->sk_state) {
     case TCP_SYN_SENT:
@@ -5604,7 +5607,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                           &tp->last_oow_ack_time))
                 tcp_send_dupack(sk, skb);
         } else if (tcp_reset_check(sk, skb)) {
-            tcp_reset(sk);
+            tcp_reset(sk, skb);
         }
         goto discard;
     }
@@ -5640,7 +5643,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
         }
 
         if (rst_seq_match)
-            tcp_reset(sk);
+            tcp_reset(sk, skb);
         else {
             /* Disable TFO if RST is out-of-order
              * and no data has been received
@@ -6077,7 +6080,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
          */
         if (th->rst) {
-            tcp_reset(sk);
+            tcp_reset(sk, skb);
             goto discard;
         }
@@ -6519,7 +6522,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
             after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
-            tcp_reset(sk);
+            tcp_reset(sk, skb);
             return 1;
         }
     }

--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -801,7 +801,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
         req->rsk_ops->send_reset(sk, skb);
     } else if (fastopen) { /* received a valid RST pkt */
         reqsk_fastopen_remove(sk, req, true);
-        tcp_reset(sk);
+        tcp_reset(sk, skb);
     }
     if (!fastopen) {
         inet_csk_reqsk_queue_drop(sk, req);

--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -282,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb,
         pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
         break;
 
+    case MPTCPOPT_MP_FASTCLOSE:
+        if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
+            break;
+
+        ptr += 2;
+        mp_opt->rcvr_key = get_unaligned_be64(ptr);
+        ptr += 8;
+        mp_opt->fastclose = 1;
+        break;
+
     default:
         break;
     }
@@ -299,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb,
     mp_opt->mp_join = 0;
     mp_opt->add_addr = 0;
     mp_opt->ahmac = 0;
+    mp_opt->fastclose = 0;
     mp_opt->port = 0;
     mp_opt->rm_addr = 0;
     mp_opt->dss = 0;
@@ -942,6 +953,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
     if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
         return;
 
+    if (mp_opt.fastclose &&
+        msk->local_key == mp_opt.rcvr_key) {
+        WRITE_ONCE(msk->rcv_fastclose, true);
+        mptcp_schedule_work((struct sock *)msk);
+    }
+
     if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
         struct mptcp_addr_info addr;

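For reference, the MP_FASTCLOSE option parsed in the net/mptcp/options.c hunks above is laid out per RFC 8684 as: kind 30 (MPTCP), length 12 (TCPOLEN_MPTCP_FASTCLOSE), subtype 7 in the high nibble of the third byte, 12 reserved bits, then the 64-bit receiver key. The following standalone userspace sketch mirrors that parse; the byte buffer and the be64_at() helper are illustrative, not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* userspace stand-in for the kernel's get_unaligned_be64() */
static uint64_t be64_at(const uint8_t *p)
{
    uint64_t v = 0;

    for (int i = 0; i < 8; i++)
        v = (v << 8) | p[i];
    return v;
}

int main(void)
{
    /* kind 30, length 12, subtype 7 (MP_FASTCLOSE) in the high
     * nibble, reserved bits zero, then the receiver's key
     */
    uint8_t opt[12] = { 30, 12, 0x70, 0x00,
                        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 };
    const uint8_t *ptr = &opt[2];   /* parser sits past kind/length */

    if (opt[1] != 12)               /* opsize != TCPOLEN_MPTCP_FASTCLOSE */
        return 1;
    ptr += 2;                       /* skip subtype + reserved, as above */
    printf("rcvr_key = %016llx\n", (unsigned long long)be64_at(ptr));
    return 0;
}

On receive, mptcp_incoming_options() only acts on the option when the carried key matches msk->local_key, as the last hunk above shows.
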
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -135,7 +135,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
     struct mptcp_pm_addr_entry *entry, *ret = NULL;
 
     rcu_read_lock();
-    spin_lock_bh(&msk->join_list_lock);
+    __mptcp_flush_join_list(msk);
     list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
         if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
             continue;
@@ -144,13 +144,11 @@ select_local_address(const struct pm_nl_pernet *pernet,
          * pending join
          */
         if (entry->addr.family == ((struct sock *)msk)->sk_family &&
-            !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
-            !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) {
+            !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
             ret = entry;
             break;
         }
     }
-    spin_unlock_bh(&msk->join_list_lock);
     rcu_read_unlock();
     return ret;
 }
@@ -867,13 +865,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
     return ret;
 }
 
-static void __flush_addrs(struct pm_nl_pernet *pernet)
+static void __flush_addrs(struct net *net, struct list_head *list)
 {
-    while (!list_empty(&pernet->local_addr_list)) {
+    while (!list_empty(list)) {
         struct mptcp_pm_addr_entry *cur;
 
-        cur = list_entry(pernet->local_addr_list.next,
+        cur = list_entry(list->next,
                  struct mptcp_pm_addr_entry, list);
+        mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr);
         list_del_rcu(&cur->list);
         kfree_rcu(cur, rcu);
     }
@@ -890,11 +889,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet)
 static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
 {
     struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+    LIST_HEAD(free_list);
 
     spin_lock_bh(&pernet->lock);
-    __flush_addrs(pernet);
+    list_splice_init(&pernet->local_addr_list, &free_list);
     __reset_counters(pernet);
     spin_unlock_bh(&pernet->lock);
+    __flush_addrs(sock_net(skb->sk), &free_list);
     return 0;
 }
@@ -1156,10 +1157,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list)
     struct net *net;
 
     list_for_each_entry(net, net_list, exit_list) {
+        struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+
         /* net is removed from namespace list, can't race with
          * other modifiers
          */
-        __flush_addrs(net_generic(net, pm_nl_pernet_id));
+        __flush_addrs(net, &pernet->local_addr_list);
     }
 }

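The reworked flush above is the classic splice-under-lock shape: detach the whole local_addr_list while holding the pernet spinlock, then run the per-entry teardown (which takes socket locks and may sleep) on the private list after the spinlock is dropped. A userspace analogy of that pattern, using pthreads in place of the kernel primitives (all names illustrative, not from the patch):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { int id; struct entry *next; };

static struct entry *addr_list;     /* protected by list_lock */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void flush_entries(struct entry *list)
{
    while (list) {                  /* runs with list_lock dropped */
        struct entry *cur = list;

        list = cur->next;
        printf("announce RM_ADDR for id %d\n", cur->id); /* may block */
        free(cur);
    }
}

int main(void)
{
    for (int i = 1; i <= 3; i++) {  /* populate the shared list */
        struct entry *e = malloc(sizeof(*e));

        e->id = i;
        e->next = addr_list;
        addr_list = e;
    }

    pthread_mutex_lock(&list_lock);
    struct entry *free_list = addr_list;    /* splice out the whole */
    addr_list = NULL;                       /* list under the lock */
    pthread_mutex_unlock(&list_lock);

    flush_entries(free_list);       /* slow teardown, lock-free */
    return 0;
}

This is why mptcp_nl_cmd_flush_addrs() now calls __flush_addrs() only after spin_unlock_bh(): the mptcp_nl_remove_subflow_and_signal_addr() call added to __flush_addrs() cannot run under a spinlock.
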
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1256,6 +1256,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
     struct mptcp_ext *mpext = NULL;
     struct sk_buff *skb, *tail;
     bool can_collapse = false;
+    int size_bias = 0;
     int avail_size;
     size_t ret = 0;
@@ -1277,10 +1278,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
         mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
         can_collapse = (info->size_goal - skb->len > 0) &&
              mptcp_skb_can_collapse_to(data_seq, skb, mpext);
-        if (!can_collapse)
+        if (!can_collapse) {
             TCP_SKB_CB(skb)->eor = 1;
-        else
+        } else {
+            size_bias = skb->len;
             avail_size = info->size_goal - skb->len;
+        }
     }
 
     /* Zero window and all data acked? Probe. */
@@ -1300,8 +1303,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
         return 0;
 
     ret = info->limit - info->sent;
-    tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page,
-                  dfrag->offset + info->sent, &ret);
+    tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
+                  dfrag->page, dfrag->offset + info->sent, &ret);
     if (!tail) {
         tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
         return -ENOMEM;
@@ -1310,8 +1313,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
     /* if the tail skb is still the cached one, collapsing really happened.
      */
     if (skb == tail) {
-        WARN_ON_ONCE(!can_collapse);
+        TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
         mpext->data_len += ret;
+        WARN_ON_ONCE(!can_collapse);
         WARN_ON_ONCE(zero_window_probe);
         goto out;
     }
@@ -2217,6 +2221,36 @@ static bool mptcp_check_close_timeout(const struct sock *sk)
     return true;
 }
 
+static void mptcp_check_fastclose(struct mptcp_sock *msk)
+{
+    struct mptcp_subflow_context *subflow, *tmp;
+    struct sock *sk = &msk->sk.icsk_inet.sk;
+
+    if (likely(!READ_ONCE(msk->rcv_fastclose)))
+        return;
+
+    mptcp_token_destroy(msk);
+
+    list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+        struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+        lock_sock(tcp_sk);
+        if (tcp_sk->sk_state != TCP_CLOSE) {
+            tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
+            tcp_set_state(tcp_sk, TCP_CLOSE);
+        }
+        release_sock(tcp_sk);
+    }
+
+    inet_sk_state_store(sk, TCP_CLOSE);
+    sk->sk_shutdown = SHUTDOWN_MASK;
+    smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+    set_bit(MPTCP_DATA_READY, &msk->flags);
+    set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
+
+    mptcp_close_wake_up(sk);
+}
+
 static void mptcp_worker(struct work_struct *work)
 {
     struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -2233,6 +2267,9 @@ static void mptcp_worker(struct work_struct *work)
     mptcp_check_data_fin_ack(sk);
     __mptcp_flush_join_list(msk);
 
+    mptcp_check_fastclose(msk);
+
     if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
         __mptcp_close_subflow(msk);

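A quick worked example of the avail_size/size_bias arithmetic in the mptcp_sendmsg_frag() hunks above: tcp_build_frag() internally recomputes the room left in the tail skb as (size_goal - skb->len), and mptcp had already subtracted skb->len when it computed avail_size, so before this change the goal was undershot by the collapsed skb's length. Illustrative numbers only:

#include <stdio.h>

int main(void)
{
    int size_goal = 65535;  /* hypothetical per-skb size goal */
    int skb_len = 1500;     /* data already in the cached tail skb */

    int avail_size = size_goal - skb_len;   /* what mptcp computes */
    int size_bias = skb_len;                /* what the patch adds back */

    /* room tcp_build_frag() derives from the size_goal it is passed */
    int old_room = avail_size - skb_len;               /* 62535, short */
    int new_room = (avail_size + size_bias) - skb_len; /* 64035, right */

    printf("old room %d, new room %d\n", old_room, new_room);
    return 0;
}

With the bias, collapsed skbs can again grow all the way to size_goal, which is what "optimizes skb size for outgoing MPTCP packets" refers to in the cover letter.
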
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -23,6 +23,7 @@
 #define OPTION_MPTCP_ADD_ADDR   BIT(6)
 #define OPTION_MPTCP_ADD_ADDR6  BIT(7)
 #define OPTION_MPTCP_RM_ADDR    BIT(8)
+#define OPTION_MPTCP_FASTCLOSE  BIT(9)
 
 /* MPTCP option subtypes */
 #define MPTCPOPT_MP_CAPABLE     0
@@ -58,6 +59,7 @@
 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT   24
 #define TCPOLEN_MPTCP_PORT_LEN      4
 #define TCPOLEN_MPTCP_RM_ADDR_BASE  4
+#define TCPOLEN_MPTCP_FASTCLOSE     12
 
 /* MPTCP MP_JOIN flags */
 #define MPTCPOPT_BACKUP     BIT(0)
@@ -110,6 +112,7 @@ struct mptcp_options_received {
     u16 data_len;
     u16 mp_capable : 1,
         mp_join : 1,
+        fastclose : 1,
         dss : 1,
         add_addr : 1,
         rm_addr : 1,
@@ -119,7 +122,7 @@ struct mptcp_options_received {
     u32 token;
     u32 nonce;
     u64 thmac;
-    u8  hmac[20];
+    u8  hmac[MPTCPOPT_HMAC_LEN];
     u8  join_id;
     u8  use_map:1,
         dsn64:1,
@@ -237,6 +240,7 @@ struct mptcp_sock {
     bool    fully_established;
     bool    rcv_data_fin;
     bool    snd_data_fin_enable;
+    bool    rcv_fastclose;
     bool    use_64bit_ack; /* Set when we received a 64-bit DSN */
     spinlock_t join_list_lock;
     struct sock *ack_hint;

--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -313,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk)
     struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
     struct sock *sk = subflow->conn;
 
+    /* must hold: tcp_done() could drop last reference on parent */
+    sock_hold(sk);
+
     tcp_set_state(ssk, TCP_CLOSE);
     tcp_send_active_reset(ssk, GFP_ATOMIC);
     tcp_done(ssk);
     if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
         schedule_work(&mptcp_sk(sk)->work))
-        sock_hold(sk);
+        return; /* worker will put sk for us */
+
+    sock_put(sk);
 }
 
 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
@@ -1167,6 +1172,30 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
     return err;
 }
 
+static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
+{
+#ifdef CONFIG_SOCK_CGROUP_DATA
+    struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
+                *child_skcd = &child->sk_cgrp_data;
+
+    /* only the additional subflows created by kworkers have to be modified */
+    if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
+        cgroup_id(sock_cgroup_ptr(child_skcd))) {
+#ifdef CONFIG_MEMCG
+        struct mem_cgroup *memcg = parent->sk_memcg;
+
+        mem_cgroup_sk_free(child);
+        if (memcg && css_tryget(&memcg->css))
+            child->sk_memcg = memcg;
+#endif /* CONFIG_MEMCG */
+
+        cgroup_sk_free(child_skcd);
+        *child_skcd = *parent_skcd;
+        cgroup_sk_clone(child_skcd);
+    }
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+}
+
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 {
     struct mptcp_subflow_context *subflow;
@@ -1187,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
     lock_sock(sf->sk);
 
+    /* the newly created socket has to be in the same cgroup as its parent */
+    mptcp_attach_cgroup(sk, sf->sk);
+
     /* kernel sockets do not by default acquire net ref, but TCP timer
      * needs it.
      */

--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -264,27 +264,37 @@ do_transfer()
     cpid=$!
 
     if [ $rm_nr_ns1 -gt 0 ]; then
-        counter=1
-        sleep 1
+        if [ $rm_nr_ns1 -lt 8 ]; then
+            counter=1
+            sleep 1
 
-        while [ $counter -le $rm_nr_ns1 ]
-        do
-            ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+            while [ $counter -le $rm_nr_ns1 ]
+            do
+                ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+                sleep 1
+                let counter+=1
+            done
+        else
             sleep 1
-            let counter+=1
-        done
+            ip netns exec ${listener_ns} ./pm_nl_ctl flush
+        fi
     fi
 
     if [ $rm_nr_ns2 -gt 0 ]; then
-        counter=1
-        sleep 1
+        if [ $rm_nr_ns2 -lt 8 ]; then
+            counter=1
+            sleep 1
 
-        while [ $counter -le $rm_nr_ns2 ]
-        do
-            ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+            while [ $counter -le $rm_nr_ns2 ]
+            do
+                ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+                sleep 1
+                let counter+=1
+            done
+        else
             sleep 1
-            let counter+=1
-        done
+            ip netns exec ${connector_ns} ./pm_nl_ctl flush
+        fi
     fi
 
     wait $cpid
@@ -663,6 +673,18 @@ chk_join_nr "remove subflows and signal" 3 3 3
 chk_add_nr 1 1
 chk_rm_nr 2 2
 
+# subflows and signal, flush
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1 0 8 8 slow
+chk_join_nr "flush subflows and signal" 3 3 3
+chk_add_nr 1 1
+chk_rm_nr 2 2
+
 # subflow IPv6
 reset
 ip netns exec $ns1 ./pm_nl_ctl limits 0 1