Commit 124de271 authored by David S. Miller's avatar David S. Miller

Merge branch 'mptcp-MP_FAIL-timeout'

Mat Martineau says:

====================
mptcp: Timeout for MP_FAIL response

When one peer sends an infinite mapping to coordinate fallback from
MPTCP to regular TCP, the other peer is expected to send a packet with
the MPTCP MP_FAIL option to acknowledge the infinite mapping. Rather
than leave the connection in some half-fallback state, this series adds
a timeout after which the infinite mapping sender will reset the
connection.

Patch 1 adds a fallback self test.

Patches 2-5 make use of the MPTCP socket's retransmit timer to reset the
MPTCP connection if no MP_FAIL was received.

Patches 6 and 7 extends the self test to check MP_FAIL-related MIBs.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents b1190d51 53f368bf
......@@ -287,11 +287,27 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct sock *s = (struct sock *)msk;
pr_debug("fail_seq=%llu", fail_seq);
if (!mptcp_has_another_subflow(sk) && READ_ONCE(msk->allow_infinite_fallback))
if (mptcp_has_another_subflow(sk) || !READ_ONCE(msk->allow_infinite_fallback))
return;
if (!READ_ONCE(subflow->mp_fail_response_expect)) {
pr_debug("send MP_FAIL response and infinite map");
subflow->send_mp_fail = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
subflow->send_infinite_map = 1;
} else if (s && inet_sk_state_load(s) != TCP_CLOSE) {
pr_debug("MP_FAIL response received");
mptcp_data_lock(s);
if (inet_sk_state_load(s) != TCP_CLOSE)
sk_stop_timer(s, &s->sk_timer);
mptcp_data_unlock(s);
}
}
/* path manager helpers */
......
......@@ -1605,8 +1605,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
out:
/* ensure the rtx timer is running */
mptcp_data_lock(sk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
mptcp_data_unlock(sk);
if (copied)
__mptcp_check_send_data_fin(sk);
}
......@@ -2167,10 +2169,38 @@ static void mptcp_retransmit_timer(struct timer_list *t)
sock_put(sk);
}
static struct mptcp_subflow_context *
mp_fail_response_expect_subflow(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *ret = NULL;
mptcp_for_each_subflow(msk, subflow) {
if (READ_ONCE(subflow->mp_fail_response_expect)) {
ret = subflow;
break;
}
}
return ret;
}
static void mptcp_check_mp_fail_response(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
bh_lock_sock(sk);
subflow = mp_fail_response_expect_subflow(msk);
if (subflow)
__set_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags);
bh_unlock_sock(sk);
}
static void mptcp_timeout_timer(struct timer_list *t)
{
struct sock *sk = from_timer(sk, t, sk_timer);
mptcp_check_mp_fail_response(mptcp_sk(sk));
mptcp_schedule_work(sk);
sock_put(sk);
}
......@@ -2491,8 +2521,27 @@ static void __mptcp_retrans(struct sock *sk)
reset_timer:
mptcp_check_and_set_pending(sk);
mptcp_data_lock(sk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
mptcp_data_unlock(sk);
}
static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct sock *ssk;
bool slow;
subflow = mp_fail_response_expect_subflow(msk);
if (subflow) {
pr_debug("MP_FAIL doesn't respond, reset the subflow");
ssk = mptcp_subflow_tcp_sock(subflow);
slow = lock_sock_fast(ssk);
mptcp_subflow_reset(ssk);
unlock_sock_fast(ssk, slow);
}
}
static void mptcp_worker(struct work_struct *work)
......@@ -2535,6 +2584,9 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
if (test_and_clear_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags))
mptcp_mp_fail_no_response(msk);
unlock:
release_sock(sk);
sock_put(sk);
......@@ -2651,8 +2703,10 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
tcp_send_ack(ssk);
mptcp_data_lock(sk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
mptcp_data_unlock(sk);
}
break;
}
......@@ -2753,8 +2807,10 @@ static void __mptcp_destroy_sock(struct sock *sk)
/* join list will be eventually flushed (with rst) at sock lock release time*/
list_splice_init(&msk->conn_list, &conn_list);
sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
mptcp_data_lock(sk);
mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
mptcp_data_unlock(sk);
msk->pm.status = 0;
/* clears msk->subflow, allowing the following loop to close
......@@ -2816,7 +2872,9 @@ static void mptcp_close(struct sock *sk, long timeout)
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
mptcp_data_lock(sk);
sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
mptcp_data_unlock(sk);
}
release_sock(sk);
if (do_cancel_work)
......@@ -2861,8 +2919,10 @@ static int mptcp_disconnect(struct sock *sk, int flags)
__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);
}
sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
mptcp_data_lock(sk);
mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
mptcp_data_unlock(sk);
if (mptcp_sk(sk)->token)
mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
......
......@@ -116,6 +116,7 @@
#define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4
#define MPTCP_WORK_CLOSE_SUBFLOW 5
#define MPTCP_FAIL_NO_RESPONSE 6
/* MPTCP socket release cb flags */
#define MPTCP_PUSH_PENDING 1
......@@ -448,6 +449,7 @@ struct mptcp_subflow_context {
stale : 1, /* unable to snd/rcv data, do not use for xmit */
local_id_valid : 1; /* local_id is correctly initialized */
enum mptcp_data_avail data_avail;
bool mp_fail_response_expect;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
......
......@@ -968,6 +968,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
bool csum_reqd = READ_ONCE(msk->csum_enabled);
struct sock *sk = (struct sock *)msk;
struct mptcp_ext *mpext;
struct sk_buff *skb;
u16 data_len;
......@@ -1009,6 +1010,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
pr_debug("infinite mapping received");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
subflow->map_data_len = 0;
if (sk && inet_sk_state_load(sk) != TCP_CLOSE) {
mptcp_data_lock(sk);
if (inet_sk_state_load(sk) != TCP_CLOSE)
sk_stop_timer(sk, &sk->sk_timer);
mptcp_data_unlock(sk);
}
return MAPPING_INVALID;
}
......@@ -1217,6 +1224,12 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_send_active_reset(ssk, GFP_ATOMIC);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
} else {
WRITE_ONCE(subflow->mp_fail_response_expect, true);
/* The data lock is acquired in __mptcp_move_skbs() */
sk_reset_timer((struct sock *)msk,
&((struct sock *)msk)->sk_timer,
jiffies + TCP_RTO_MAX);
}
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return true;
......
......@@ -12,6 +12,9 @@ CONFIG_NF_TABLES=m
CONFIG_NFT_COMPAT=m
CONFIG_NETFILTER_XTABLES=m
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NF_TABLES_INET=y
CONFIG_NFT_TPROXY=m
CONFIG_NFT_SOCKET=m
......@@ -19,3 +22,8 @@ CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_NET_ACT_CSUM=m
CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_CLS_ACT=y
CONFIG_NET_CLS_FW=m
CONFIG_NET_SCH_INGRESS=m
......@@ -266,6 +266,58 @@ reset_with_allow_join_id0()
ip netns exec $ns2 sysctl -q net.mptcp.allow_join_initial_addr_port=$ns2_enable
}
# Modify TCP payload without corrupting the TCP packet
#
# This rule inverts a 8-bit word at byte offset 148 for the 2nd TCP ACK packets
# carrying enough data.
# Once it is done, the TCP Checksum field is updated so the packet is still
# considered as valid at the TCP level.
# Because the MPTCP checksum, covering the TCP options and data, has not been
# updated, the modification will be detected and an MP_FAIL will be emitted:
# what we want to validate here without corrupting "random" MPTCP options.
#
# To avoid having tc producing this pr_info() message for each TCP ACK packets
# not carrying enough data:
#
# tc action pedit offset 162 out of bounds
#
# Netfilter is used to mark packets with enough data.
reset_with_fail()
{
reset "${1}" || return 1
ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=1
ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=1
check_invert=1
validate_checksum=1
local i="$2"
local ip="${3:-4}"
local tables
tables="iptables"
if [ $ip -eq 6 ]; then
tables="ip6tables"
fi
ip netns exec $ns2 $tables \
-t mangle \
-A OUTPUT \
-o ns2eth$i \
-p tcp \
-m length --length 150:9999 \
-m statistic --mode nth --packet 1 --every 99999 \
-j MARK --set-mark 42 || exit 1
tc -n $ns2 qdisc add dev ns2eth$i clsact || exit 1
tc -n $ns2 filter add dev ns2eth$i egress \
protocol ip prio 1000 \
handle 42 fw \
action pedit munge offset 148 u8 invert \
pipe csum tcp \
index 100 || exit 1
}
fail_test()
{
ret=1
......@@ -961,6 +1013,7 @@ chk_csum_nr()
local csum_ns2=${2:-0}
local count
local dump_stats
local extra_msg=""
local allow_multi_errors_ns1=0
local allow_multi_errors_ns2=0
......@@ -976,6 +1029,9 @@ chk_csum_nr()
printf "%-${nr_blank}s %s" " " "sum"
count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$csum_ns1" ]; then
extra_msg="$extra_msg ns1=$count"
fi
if { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } ||
{ [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then
echo "[fail] got $count data checksum error[s] expected $csum_ns1"
......@@ -987,28 +1043,58 @@ chk_csum_nr()
echo -n " - csum "
count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$csum_ns2" ]; then
extra_msg="$extra_msg ns2=$count"
fi
if { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } ||
{ [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then
echo "[fail] got $count data checksum error[s] expected $csum_ns2"
fail_test
dump_stats=1
else
echo "[ ok ]"
echo -n "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
echo "$extra_msg"
}
chk_fail_nr()
{
local fail_tx=$1
local fail_rx=$2
local ns_invert=${3:-""}
local count
local dump_stats
local ns_tx=$ns1
local ns_rx=$ns2
local extra_msg=""
local allow_tx_lost=0
local allow_rx_lost=0
if [[ $ns_invert = "invert" ]]; then
ns_tx=$ns2
ns_rx=$ns1
extra_msg=" invert"
fi
if [[ "${fail_tx}" = "-"* ]]; then
allow_tx_lost=1
fail_tx=${fail_tx:1}
fi
if [[ "${fail_rx}" = "-"* ]]; then
allow_rx_lost=1
fail_rx=${fail_rx:1}
fi
printf "%-${nr_blank}s %s" " " "ftx"
count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}')
count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$fail_tx" ]; then
extra_msg="$extra_msg,tx=$count"
fi
if { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } ||
{ [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then
echo "[fail] got $count MP_FAIL[s] TX expected $fail_tx"
fail_test
dump_stats=1
......@@ -1017,17 +1103,23 @@ chk_fail_nr()
fi
echo -n " - failrx"
count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}')
count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$fail_rx" ]; then
extra_msg="$extra_msg,rx=$count"
fi
if { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } ||
{ [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then
echo "[fail] got $count MP_FAIL[s] RX expected $fail_rx"
fail_test
dump_stats=1
else
echo "[ ok ]"
echo -n "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
echo "$extra_msg"
}
chk_fclose_nr()
......@@ -1199,7 +1291,7 @@ chk_join_nr()
echo "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
if [ $checksum -eq 1 ]; then
if [ $validate_checksum -eq 1 ]; then
chk_csum_nr $csum_ns1 $csum_ns2
chk_fail_nr $fail_nr $fail_nr
chk_rst_nr $rst_nr $rst_nr
......@@ -2590,6 +2682,22 @@ fastclose_tests()
fi
}
pedit_action_pkts()
{
tc -n $ns2 -j -s action show action pedit index 100 | \
sed 's/.*"packets":\([0-9]\+\),.*/\1/'
}
fail_tests()
{
# single subflow
if reset_with_fail "Infinite map" 1; then
run_tests $ns1 $ns2 10.0.1.1 128
chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)"
chk_fail_nr 1 -1 invert
fi
}
implicit_tests()
{
# userspace pm type prevents add_addr
......@@ -2658,6 +2766,7 @@ all_tests_sorted=(
d@deny_join_id0_tests
m@fullmesh_tests
z@fastclose_tests
F@fail_tests
I@implicit_tests
)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment