Merge branch 'mptcp-MP_FAIL-timeout'

Mat Martineau says:

====================
mptcp: Timeout for MP_FAIL response

When one peer sends an infinite mapping to coordinate fallback from
MPTCP to regular TCP, the other peer is expected to send a packet with
the MPTCP MP_FAIL option to acknowledge the infinite mapping. Rather
than leave the connection in some half-fallback state, this series adds
a timeout after which the infinite mapping sender will reset the
connection.

Patch 1 adds a fallback self test.

Patches 2-5 make use of the MPTCP socket's retransmit timer to reset the
MPTCP connection if no MP_FAIL was received.

Patches 6 and 7 extend the self test to check MP_FAIL-related MIBs.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committed by David S. Miller, 2022-04-27 10:45:55 +01:00
commit 124de27101
6 changed files with 216 additions and 8 deletions

--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -287,11 +287,27 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
 {
         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
         struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+        struct sock *s = (struct sock *)msk;

         pr_debug("fail_seq=%llu", fail_seq);

-        if (!mptcp_has_another_subflow(sk) && READ_ONCE(msk->allow_infinite_fallback))
+        if (mptcp_has_another_subflow(sk) || !READ_ONCE(msk->allow_infinite_fallback))
+                return;
+
+        if (!READ_ONCE(subflow->mp_fail_response_expect)) {
+                pr_debug("send MP_FAIL response and infinite map");
+
+                subflow->send_mp_fail = 1;
+                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
                 subflow->send_infinite_map = 1;
+        } else if (s && inet_sk_state_load(s) != TCP_CLOSE) {
+                pr_debug("MP_FAIL response received");
+
+                mptcp_data_lock(s);
+                if (inet_sk_state_load(s) != TCP_CLOSE)
+                        sk_stop_timer(s, &s->sk_timer);
+                mptcp_data_unlock(s);
+        }
 }

 /* path manager helpers */

--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1605,8 +1605,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)

 out:
         /* ensure the rtx timer is running */
+        mptcp_data_lock(sk);
         if (!mptcp_timer_pending(sk))
                 mptcp_reset_timer(sk);
+        mptcp_data_unlock(sk);
         if (copied)
                 __mptcp_check_send_data_fin(sk);
 }
@@ -2167,10 +2169,38 @@ static void mptcp_retransmit_timer(struct timer_list *t)
         sock_put(sk);
 }

+static struct mptcp_subflow_context *
+mp_fail_response_expect_subflow(struct mptcp_sock *msk)
+{
+        struct mptcp_subflow_context *subflow, *ret = NULL;
+
+        mptcp_for_each_subflow(msk, subflow) {
+                if (READ_ONCE(subflow->mp_fail_response_expect)) {
+                        ret = subflow;
+                        break;
+                }
+        }
+
+        return ret;
+}
+
+static void mptcp_check_mp_fail_response(struct mptcp_sock *msk)
+{
+        struct mptcp_subflow_context *subflow;
+        struct sock *sk = (struct sock *)msk;
+
+        bh_lock_sock(sk);
+        subflow = mp_fail_response_expect_subflow(msk);
+        if (subflow)
+                __set_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags);
+        bh_unlock_sock(sk);
+}
+
 static void mptcp_timeout_timer(struct timer_list *t)
 {
         struct sock *sk = from_timer(sk, t, sk_timer);

+        mptcp_check_mp_fail_response(mptcp_sk(sk));
         mptcp_schedule_work(sk);
         sock_put(sk);
 }
@@ -2491,8 +2521,27 @@ static void __mptcp_retrans(struct sock *sk)
 reset_timer:
         mptcp_check_and_set_pending(sk);

+        mptcp_data_lock(sk);
         if (!mptcp_timer_pending(sk))
                 mptcp_reset_timer(sk);
+        mptcp_data_unlock(sk);
+}
+
+static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
+{
+        struct mptcp_subflow_context *subflow;
+        struct sock *ssk;
+        bool slow;
+
+        subflow = mp_fail_response_expect_subflow(msk);
+        if (subflow) {
+                pr_debug("MP_FAIL doesn't respond, reset the subflow");
+
+                ssk = mptcp_subflow_tcp_sock(subflow);
+                slow = lock_sock_fast(ssk);
+                mptcp_subflow_reset(ssk);
+                unlock_sock_fast(ssk, slow);
+        }
 }

 static void mptcp_worker(struct work_struct *work)
@@ -2535,6 +2584,9 @@ static void mptcp_worker(struct work_struct *work)
         if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
                 __mptcp_retrans(sk);

+        if (test_and_clear_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags))
+                mptcp_mp_fail_no_response(msk);
+
 unlock:
         release_sock(sk);
         sock_put(sk);
@@ -2651,8 +2703,10 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
                 } else {
                         pr_debug("Sending DATA_FIN on subflow %p", ssk);
                         tcp_send_ack(ssk);
+                        mptcp_data_lock(sk);
                         if (!mptcp_timer_pending(sk))
                                 mptcp_reset_timer(sk);
+                        mptcp_data_unlock(sk);
                 }
                 break;
         }
@@ -2753,8 +2807,10 @@ static void __mptcp_destroy_sock(struct sock *sk)
         /* join list will be eventually flushed (with rst) at sock lock release time*/
         list_splice_init(&msk->conn_list, &conn_list);

-        sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+        mptcp_data_lock(sk);
+        mptcp_stop_timer(sk);
         sk_stop_timer(sk, &sk->sk_timer);
+        mptcp_data_unlock(sk);
         msk->pm.status = 0;

         /* clears msk->subflow, allowing the following loop to close
@@ -2816,7 +2872,9 @@ cleanup:
                 __mptcp_destroy_sock(sk);
                 do_cancel_work = true;
         } else {
+                mptcp_data_lock(sk);
                 sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
+                mptcp_data_unlock(sk);
         }
         release_sock(sk);
         if (do_cancel_work)
@@ -2861,8 +2919,10 @@ static int mptcp_disconnect(struct sock *sk, int flags)
                 __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);
         }

-        sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+        mptcp_data_lock(sk);
+        mptcp_stop_timer(sk);
         sk_stop_timer(sk, &sk->sk_timer);
+        mptcp_data_unlock(sk);

         if (mptcp_sk(sk)->token)
                 mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);

--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -116,6 +116,7 @@
 #define MPTCP_WORK_EOF 3
 #define MPTCP_FALLBACK_DONE 4
 #define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_FAIL_NO_RESPONSE 6

 /* MPTCP socket release cb flags */
 #define MPTCP_PUSH_PENDING 1
@@ -448,6 +449,7 @@ struct mptcp_subflow_context {
                 stale : 1,          /* unable to snd/rcv data, do not use for xmit */
                 local_id_valid : 1; /* local_id is correctly initialized */
         enum mptcp_data_avail data_avail;
+        bool mp_fail_response_expect;
         u32 remote_nonce;
         u64 thmac;
         u32 local_nonce;

--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -968,6 +968,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 {
         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
         bool csum_reqd = READ_ONCE(msk->csum_enabled);
+        struct sock *sk = (struct sock *)msk;
         struct mptcp_ext *mpext;
         struct sk_buff *skb;
         u16 data_len;
@@ -1009,6 +1010,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
                 pr_debug("infinite mapping received");
                 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
                 subflow->map_data_len = 0;
+                if (sk && inet_sk_state_load(sk) != TCP_CLOSE) {
+                        mptcp_data_lock(sk);
+                        if (inet_sk_state_load(sk) != TCP_CLOSE)
+                                sk_stop_timer(sk, &sk->sk_timer);
+                        mptcp_data_unlock(sk);
+                }
                 return MAPPING_INVALID;
         }

@@ -1217,6 +1224,12 @@ fallback:
                         tcp_send_active_reset(ssk, GFP_ATOMIC);
                         while ((skb = skb_peek(&ssk->sk_receive_queue)))
                                 sk_eat_skb(ssk, skb);
+                } else {
+                        WRITE_ONCE(subflow->mp_fail_response_expect, true);
+                        /* The data lock is acquired in __mptcp_move_skbs() */
+                        sk_reset_timer((struct sock *)msk,
+                                       &((struct sock *)msk)->sk_timer,
+                                       jiffies + TCP_RTO_MAX);
                 }
                 WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
                 return true;

--- a/tools/testing/selftests/net/mptcp/config
+++ b/tools/testing/selftests/net/mptcp/config
@@ -12,6 +12,9 @@ CONFIG_NF_TABLES=m
 CONFIG_NFT_COMPAT=m
 CONFIG_NETFILTER_XTABLES=m
 CONFIG_NETFILTER_XT_MATCH_BPF=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
 CONFIG_NF_TABLES_INET=y
 CONFIG_NFT_TPROXY=m
 CONFIG_NFT_SOCKET=m
@@ -19,3 +22,8 @@ CONFIG_IP_ADVANCED_ROUTER=y
 CONFIG_IP_MULTIPLE_TABLES=y
 CONFIG_IP_NF_TARGET_REJECT=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_SCH_INGRESS=m
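
The config additions above pull in the netfilter length/statistic matches, the MARK target, and the tc clsact/fw/pedit/csum pieces used by the new reset_with_fail() helper. As a usage note rather than part of the patch set, one way to fold this fragment into a test kernel's .config is the kernel's scripts/kconfig/merge_config.sh helper; the path below assumes the fragment lives at tools/testing/selftests/net/mptcp/config:

  # Merge the MPTCP selftest config fragment into an existing .config, then let
  # Kconfig fill in defaults for anything still unset (run from the kernel tree root).
  ./scripts/kconfig/merge_config.sh -m .config tools/testing/selftests/net/mptcp/config
  make olddefconfig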

--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -266,6 +266,58 @@ reset_with_allow_join_id0()
         ip netns exec $ns2 sysctl -q net.mptcp.allow_join_initial_addr_port=$ns2_enable
 }

+# Modify TCP payload without corrupting the TCP packet
+#
+# This rule inverts a 8-bit word at byte offset 148 for the 2nd TCP ACK packets
+# carrying enough data.
+# Once it is done, the TCP Checksum field is updated so the packet is still
+# considered as valid at the TCP level.
+# Because the MPTCP checksum, covering the TCP options and data, has not been
+# updated, the modification will be detected and an MP_FAIL will be emitted:
+# what we want to validate here without corrupting "random" MPTCP options.
+#
+# To avoid having tc producing this pr_info() message for each TCP ACK packets
+# not carrying enough data:
+#
+#     tc action pedit offset 162 out of bounds
+#
+# Netfilter is used to mark packets with enough data.
+reset_with_fail()
+{
+        reset "${1}" || return 1
+
+        ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=1
+        ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=1
+
+        check_invert=1
+        validate_checksum=1
+        local i="$2"
+        local ip="${3:-4}"
+        local tables
+
+        tables="iptables"
+        if [ $ip -eq 6 ]; then
+                tables="ip6tables"
+        fi
+
+        ip netns exec $ns2 $tables \
+                -t mangle \
+                -A OUTPUT \
+                -o ns2eth$i \
+                -p tcp \
+                -m length --length 150:9999 \
+                -m statistic --mode nth --packet 1 --every 99999 \
+                -j MARK --set-mark 42 || exit 1
+
+        tc -n $ns2 qdisc add dev ns2eth$i clsact || exit 1
+        tc -n $ns2 filter add dev ns2eth$i egress \
+                protocol ip prio 1000 \
+                handle 42 fw \
+                action pedit munge offset 148 u8 invert \
+                pipe csum tcp \
+                index 100 || exit 1
+}
+
 fail_test()
 {
         ret=1
@@ -961,6 +1013,7 @@ chk_csum_nr()
         local csum_ns2=${2:-0}
         local count
         local dump_stats
+        local extra_msg=""
         local allow_multi_errors_ns1=0
         local allow_multi_errors_ns2=0

@@ -976,6 +1029,9 @@ chk_csum_nr()
         printf "%-${nr_blank}s %s" " " "sum"
         count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
         [ -z "$count" ] && count=0
+        if [ "$count" != "$csum_ns1" ]; then
+                extra_msg="$extra_msg ns1=$count"
+        fi
         if { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } ||
            { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then
                 echo "[fail] got $count data checksum error[s] expected $csum_ns1"
@@ -987,28 +1043,58 @@ chk_csum_nr()
         echo -n " - csum "
         count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
         [ -z "$count" ] && count=0
+        if [ "$count" != "$csum_ns2" ]; then
+                extra_msg="$extra_msg ns2=$count"
+        fi
         if { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } ||
            { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then
                 echo "[fail] got $count data checksum error[s] expected $csum_ns2"
                 fail_test
                 dump_stats=1
         else
-                echo "[ ok ]"
+                echo -n "[ ok ]"
         fi
         [ "${dump_stats}" = 1 ] && dump_stats
+        echo "$extra_msg"
 }

 chk_fail_nr()
 {
         local fail_tx=$1
         local fail_rx=$2
+        local ns_invert=${3:-""}
         local count
         local dump_stats
+        local ns_tx=$ns1
+        local ns_rx=$ns2
+        local extra_msg=""
+        local allow_tx_lost=0
+        local allow_rx_lost=0

+        if [[ $ns_invert = "invert" ]]; then
+                ns_tx=$ns2
+                ns_rx=$ns1
+                extra_msg=" invert"
+        fi
+
+        if [[ "${fail_tx}" = "-"* ]]; then
+                allow_tx_lost=1
+                fail_tx=${fail_tx:1}
+        fi
+        if [[ "${fail_rx}" = "-"* ]]; then
+                allow_rx_lost=1
+                fail_rx=${fail_rx:1}
+        fi
+
         printf "%-${nr_blank}s %s" " " "ftx"
-        count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}')
+        count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}')
         [ -z "$count" ] && count=0
         if [ "$count" != "$fail_tx" ]; then
+                extra_msg="$extra_msg,tx=$count"
+        fi
+
+        if { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } ||
+           { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then
                 echo "[fail] got $count MP_FAIL[s] TX expected $fail_tx"
                 fail_test
                 dump_stats=1
@@ -1017,17 +1103,23 @@ chk_fail_nr()
         fi

         echo -n " - failrx"
-        count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}')
+        count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}')
         [ -z "$count" ] && count=0
         if [ "$count" != "$fail_rx" ]; then
+                extra_msg="$extra_msg,rx=$count"
+        fi
+
+        if { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } ||
+           { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then
                 echo "[fail] got $count MP_FAIL[s] RX expected $fail_rx"
                 fail_test
                 dump_stats=1
         else
-                echo "[ ok ]"
+                echo -n "[ ok ]"
         fi

         [ "${dump_stats}" = 1 ] && dump_stats
+        echo "$extra_msg"
 }

 chk_fclose_nr()
@@ -1199,7 +1291,7 @@ chk_join_nr()
                 echo "[ ok ]"
         fi
         [ "${dump_stats}" = 1 ] && dump_stats
-        if [ $checksum -eq 1 ]; then
+        if [ $validate_checksum -eq 1 ]; then
                 chk_csum_nr $csum_ns1 $csum_ns2
                 chk_fail_nr $fail_nr $fail_nr
                 chk_rst_nr $rst_nr $rst_nr
@@ -2590,6 +2682,22 @@ fastclose_tests()
         fi
 }

+pedit_action_pkts()
+{
+        tc -n $ns2 -j -s action show action pedit index 100 | \
+                sed 's/.*"packets":\([0-9]\+\),.*/\1/'
+}
+
+fail_tests()
+{
+        # single subflow
+        if reset_with_fail "Infinite map" 1; then
+                run_tests $ns1 $ns2 10.0.1.1 128
+                chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)"
+                chk_fail_nr 1 -1 invert
+        fi
+}
+
 implicit_tests()
 {
         # userspace pm type prevents add_addr
@@ -2658,6 +2766,7 @@ all_tests_sorted=(
         d@deny_join_id0_tests
         m@fullmesh_tests
         z@fastclose_tests
+        F@fail_tests
         I@implicit_tests
 )
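
With the new F@fail_tests entry in all_tests_sorted, the MP_FAIL coverage can be run on its own through the script's letter-based test selection. A usage sketch, assuming the standard selftest layout and root privileges:

  # 'F' maps to fail_tests via all_tests_sorted; run from the selftest directory.
  cd tools/testing/selftests/net/mptcp
  ./mptcp_join.sh -F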