Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

The following patchset contains Netfilter fixes for net:

1) Do not refresh timeout in SYN_SENT for syn retransmissions.
   Add selftest for unreplied TCP connection, from Florian Westphal.

2) Fix null dereference from error path with hardware offload
   in nftables.

3) Remove useless nf_ct_gre_keymap_flush() from netns exit path,
   from Vasily Averin.

4) Missing rcu read-lock side in ctnetlink helper info dump,
   also from Vasily.

5) Do not mark RST in the reply direction coming after SYN packet
   for an out-of-sync entry, from Ali Abdallah and Florian Westphal.

6) Add tcp_ignore_invalid_rst sysctl to allow to disable out of
   segment RSTs, from Ali.

7) KCSAN fix for nf_conntrack_all_lock(), from Manfred Spraul.

8) Honor NFTA_LAST_SET in nft_last.

9) Fix incorrect arithmetics when restore last_jiffies in nft_last.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2021-07-07 14:00:14 -07:00
commit d7fba8ff3e
15 changed files with 262 additions and 49 deletions

View File

@ -110,6 +110,12 @@ nf_conntrack_tcp_be_liberal - BOOLEAN
Be conservative in what you do, be liberal in what you accept from others.
If it's non-zero, we mark only out of window RST segments as INVALID.
nf_conntrack_tcp_ignore_invalid_rst - BOOLEAN
- 0 - disabled (default)
- 1 - enabled
If it's 1, we don't mark out of window RST segments as INVALID.
nf_conntrack_tcp_loose - BOOLEAN
- 0 - disabled
- not 0 - enabled (default)

View File

@ -30,7 +30,6 @@ void nf_conntrack_cleanup_net(struct net *net);
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);
void nf_conntrack_proto_pernet_init(struct net *net);
void nf_conntrack_proto_pernet_fini(struct net *net);
int nf_conntrack_proto_init(void);
void nf_conntrack_proto_fini(void);

View File

@ -27,6 +27,7 @@ struct nf_tcp_net {
u8 tcp_loose;
u8 tcp_be_liberal;
u8 tcp_max_retrans;
u8 tcp_ignore_invalid_rst;
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
unsigned int offload_timeout;
unsigned int offload_pickup;

View File

@ -61,7 +61,7 @@ enum nfulnl_attr_type {
NFULA_HWTYPE, /* hardware type */
NFULA_HWHEADER, /* hardware header */
NFULA_HWLEN, /* hardware header length */
NFULA_CT, /* nf_conntrack_netlink.h */
NFULA_CT, /* nfnetlink_conntrack.h */
NFULA_CT_INFO, /* enum ip_conntrack_info */
NFULA_VLAN, /* nested attribute: packet vlan info */
NFULA_L2HDR, /* full L2 header */

View File

@ -51,11 +51,11 @@ enum nfqnl_attr_type {
NFQA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */
NFQA_HWADDR, /* nfqnl_msg_packet_hw */
NFQA_PAYLOAD, /* opaque data payload */
NFQA_CT, /* nf_conntrack_netlink.h */
NFQA_CT, /* nfnetlink_conntrack.h */
NFQA_CT_INFO, /* enum ip_conntrack_info */
NFQA_CAP_LEN, /* __u32 length of captured packet */
NFQA_SKB_INFO, /* __u32 skb meta information */
NFQA_EXP, /* nf_conntrack_netlink.h */
NFQA_EXP, /* nfnetlink_conntrack.h */
NFQA_UID, /* __u32 sk uid */
NFQA_GID, /* __u32 sk gid */
NFQA_SECCTX, /* security context string */

View File

@ -149,7 +149,15 @@ static void nf_conntrack_all_lock(void)
spin_lock(&nf_conntrack_locks_all_lock);
nf_conntrack_locks_all = true;
/* For nf_contrack_locks_all, only the latest time when another
* CPU will see an update is controlled, by the "release" of the
* spin_lock below.
* The earliest time is not controlled, an thus KCSAN could detect
* a race when nf_conntract_lock() reads the variable.
* WRITE_ONCE() is used to ensure the compiler will not
* optimize the write.
*/
WRITE_ONCE(nf_conntrack_locks_all, true);
for (i = 0; i < CONNTRACK_LOCKS; i++) {
spin_lock(&nf_conntrack_locks[i]);
@ -2457,7 +2465,6 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
nf_conntrack_proto_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);

View File

@ -218,6 +218,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
if (!help)
return 0;
rcu_read_lock();
helper = rcu_dereference(help->helper);
if (!helper)
goto out;
@ -233,9 +234,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
nla_nest_end(skb, nest_helper);
out:
rcu_read_unlock();
return 0;
nla_put_failure:
rcu_read_unlock();
return -1;
}

View File

@ -697,13 +697,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
#endif
}
void nf_conntrack_proto_pernet_fini(struct net *net)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
nf_ct_gre_keymap_flush(net);
#endif
}
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);

View File

@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
return &net->ct.nf_ct_proto.gre;
}
void nf_ct_gre_keymap_flush(struct net *net)
{
struct nf_gre_net *net_gre = gre_pernet(net);
struct nf_ct_gre_keymap *km, *tmp;
spin_lock_bh(&keymap_lock);
list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
list_del_rcu(&km->list);
kfree_rcu(km, rcu);
}
spin_unlock_bh(&keymap_lock);
}
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
const struct nf_conntrack_tuple *t)
{

View File

@ -823,6 +823,22 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
}
static bool tcp_can_early_drop(const struct nf_conn *ct)
{
switch (ct->proto.tcp.state) {
case TCP_CONNTRACK_FIN_WAIT:
case TCP_CONNTRACK_LAST_ACK:
case TCP_CONNTRACK_TIME_WAIT:
case TCP_CONNTRACK_CLOSE:
case TCP_CONNTRACK_CLOSE_WAIT:
return true;
default:
break;
}
return false;
}
/* Returns verdict for packet, or -1 for invalid. */
int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@ -1030,10 +1046,30 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
if (index != TCP_RST_SET)
break;
if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
/* If we are closing, tuple might have been re-used already.
* last_index, last_ack, and all other ct fields used for
* sequence/window validation are outdated in that case.
*
* As the conntrack can already be expired by GC under pressure,
* just skip validation checks.
*/
if (tcp_can_early_drop(ct))
goto in_window;
/* td_maxack might be outdated if we let a SYN through earlier */
if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
ct->proto.tcp.last_index != TCP_SYN_SET) {
u32 seq = ntohl(th->seq);
if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
/* If we are not in established state and SEQ=0 this is most
* likely an answer to a SYN we let go through above (last_index
* can be updated due to out-of-order ACKs).
*/
if (seq == 0 && !nf_conntrack_tcp_established(ct))
break;
if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
!tn->tcp_ignore_invalid_rst) {
/* Invalid RST */
spin_unlock_bh(&ct->lock);
nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
@ -1134,6 +1170,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
/* do not renew timeout on SYN retransmit.
*
* Else port reuse by client or NAT middlebox can keep
* entry alive indefinitely (including nat info).
*/
return NF_ACCEPT;
}
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
* pickup with loose=1. Avoid large ESTABLISHED timeout.
*/
@ -1155,22 +1201,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
static bool tcp_can_early_drop(const struct nf_conn *ct)
{
switch (ct->proto.tcp.state) {
case TCP_CONNTRACK_FIN_WAIT:
case TCP_CONNTRACK_LAST_ACK:
case TCP_CONNTRACK_TIME_WAIT:
case TCP_CONNTRACK_CLOSE:
case TCP_CONNTRACK_CLOSE_WAIT:
return true;
default:
break;
}
return false;
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@ -1437,6 +1467,9 @@ void nf_conntrack_tcp_init_net(struct net *net)
*/
tn->tcp_be_liberal = 0;
/* If it's non-zero, we turn off RST sequence number check */
tn->tcp_ignore_invalid_rst = 0;
/* Max number of the retransmitted packets without receiving an (acceptable)
* ACK from the destination. If this number is reached, a shorter timer
* will be started.

View File

@ -579,6 +579,7 @@ enum nf_ct_sysctl_index {
#endif
NF_SYSCTL_CT_PROTO_TCP_LOOSE,
NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
@ -798,6 +799,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
.procname = "nf_conntrack_tcp_ignore_invalid_rst",
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
.procname = "nf_conntrack_tcp_max_retrans",
.maxlen = sizeof(u8),
@ -1004,6 +1013,7 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
XASSIGN(LOOSE, &tn->tcp_loose);
XASSIGN(LIBERAL, &tn->tcp_be_liberal);
XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
#undef XASSIGN
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)

View File

@ -3446,7 +3446,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
return 0;
err_destroy_flow_rule:
nft_flow_rule_destroy(flow);
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
nf_tables_rule_release(&ctx, rule);
err_release_expr:

View File

@ -23,15 +23,21 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
{
struct nft_last_priv *priv = nft_expr_priv(expr);
u64 last_jiffies;
u32 last_set = 0;
int err;
if (tb[NFTA_LAST_MSECS]) {
if (tb[NFTA_LAST_SET]) {
last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
if (last_set == 1)
priv->last_set = 1;
}
if (last_set && tb[NFTA_LAST_MSECS]) {
err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
if (err < 0)
return err;
priv->last_jiffies = jiffies + (unsigned long)last_jiffies;
priv->last_set = 1;
priv->last_jiffies = jiffies - (unsigned long)last_jiffies;
}
return 0;

View File

@ -5,7 +5,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
nft_concat_range.sh nft_conntrack_helper.sh \
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
ipip-conntrack-mtu.sh
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh
LDLIBS = -lmnl
TEST_GEN_FILES = nf-queue

View File

@ -0,0 +1,167 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Check that UNREPLIED tcp conntrack will eventually timeout.
#
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
ret=0
waittime=20
sfx=$(mktemp -u "XXXXXXXX")
ns1="ns1-$sfx"
ns2="ns2-$sfx"
nft --version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without nft tool"
exit $ksft_skip
fi
ip -Version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without ip tool"
exit $ksft_skip
fi
cleanup() {
ip netns pids $ns1 | xargs kill 2>/dev/null
ip netns pids $ns2 | xargs kill 2>/dev/null
ip netns del $ns1
ip netns del $ns2
}
ipv4() {
echo -n 192.168.$1.2
}
check_counter()
{
ns=$1
name=$2
expect=$3
local lret=0
cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect")
if [ $? -ne 0 ]; then
echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2
ip netns exec $ns2 nft list counter inet filter "$name" 1>&2
lret=1
fi
return $lret
}
# Create test namespaces
ip netns add $ns1 || exit 1
trap cleanup EXIT
ip netns add $ns2 || exit 1
# Connect the namespace to the host using a veth pair
ip -net $ns1 link add name veth1 type veth peer name veth2
ip -net $ns1 link set netns $ns2 dev veth2
ip -net $ns1 link set up dev lo
ip -net $ns2 link set up dev lo
ip -net $ns1 link set up dev veth1
ip -net $ns2 link set up dev veth2
ip -net $ns2 addr add 10.11.11.2/24 dev veth2
ip -net $ns2 route add default via 10.11.11.1
ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1
# add a rule inside NS so we enable conntrack
ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT
ip -net $ns1 addr add 10.11.11.1/24 dev veth1
ip -net $ns1 route add 10.99.99.99 via 10.11.11.2
# Check connectivity works
ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1
ip netns exec $ns2 nc -l -p 8080 < /dev/null &
# however, conntrack entries are there
ip netns exec $ns2 nft -f - <<EOF
table inet filter {
counter connreq { }
counter redir { }
chain input {
type filter hook input priority 0; policy accept;
ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
ct state new ct status dnat tcp dport 8080 counter name "redir" accept
}
}
EOF
if [ $? -ne 0 ]; then
echo "ERROR: Could not load nft rules"
exit 1
fi
ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10
echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
ip netns exec $ns1 bash -c 'while true ; do
nc -p 60000 10.99.99.99 80
sleep 1
done' &
sleep 1
ip netns exec $ns2 nft -f - <<EOF
table inet nat {
chain prerouting {
type nat hook prerouting priority 0; policy accept;
ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
}
}
EOF
if [ $? -ne 0 ]; then
echo "ERROR: Could not load nat redirect"
exit 1
fi
count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
if [ $count -eq 0 ]; then
echo "ERROR: $ns2 did not pick up tcp connection from peer"
exit 1
fi
echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect"
for i in $(seq 1 $waittime); do
echo -n "."
sleep 1
count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
if [ $count -gt 0 ]; then
echo
echo "PASS: redirection took effect after $i seconds"
break
fi
m=$((i%20))
if [ $m -eq 0 ]; then
echo " waited for $i seconds"
fi
done
expect="packets 1 bytes 60"
check_counter "$ns2" "redir" "$expect"
if [ $? -ne 0 ]; then
ret=1
fi
if [ $ret -eq 0 ];then
echo "PASS: redirection counter has expected values"
else
echo "ERROR: no tcp connection was redirected"
fi
exit $ret