Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf
Pablo Neira Ayuso says: ==================== Netfilter fixes for net The following patchset contains Netfilter fixes for net: 1) Do not refresh timeout in SYN_SENT for syn retransmissions. Add selftest for unreplied TCP connection, from Florian Westphal. 2) Fix null dereference from error path with hardware offload in nftables. 3) Remove useless nf_ct_gre_keymap_flush() from netns exit path, from Vasily Averin. 4) Missing rcu read-lock side in ctnetlink helper info dump, also from Vasily. 5) Do not mark RST in the reply direction coming after SYN packet for an out-of-sync entry, from Ali Abdallah and Florian Westphal. 6) Add tcp_ignore_invalid_rst sysctl to allow to disable out of segment RSTs, from Ali. 7) KCSAN fix for nf_conntrack_all_lock(), from Manfred Spraul. 8) Honor NFTA_LAST_SET in nft_last. 9) Fix incorrect arithmetics when restore last_jiffies in nft_last. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
d7fba8ff3e
|
@ -110,6 +110,12 @@ nf_conntrack_tcp_be_liberal - BOOLEAN
|
|||
Be conservative in what you do, be liberal in what you accept from others.
|
||||
If it's non-zero, we mark only out of window RST segments as INVALID.
|
||||
|
||||
nf_conntrack_tcp_ignore_invalid_rst - BOOLEAN
|
||||
- 0 - disabled (default)
|
||||
- 1 - enabled
|
||||
|
||||
If it's 1, we don't mark out of window RST segments as INVALID.
|
||||
|
||||
nf_conntrack_tcp_loose - BOOLEAN
|
||||
- 0 - disabled
|
||||
- not 0 - enabled (default)
|
||||
|
|
|
@ -30,7 +30,6 @@ void nf_conntrack_cleanup_net(struct net *net);
|
|||
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);
|
||||
|
||||
void nf_conntrack_proto_pernet_init(struct net *net);
|
||||
void nf_conntrack_proto_pernet_fini(struct net *net);
|
||||
|
||||
int nf_conntrack_proto_init(void);
|
||||
void nf_conntrack_proto_fini(void);
|
||||
|
|
|
@ -27,6 +27,7 @@ struct nf_tcp_net {
|
|||
u8 tcp_loose;
|
||||
u8 tcp_be_liberal;
|
||||
u8 tcp_max_retrans;
|
||||
u8 tcp_ignore_invalid_rst;
|
||||
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
|
||||
unsigned int offload_timeout;
|
||||
unsigned int offload_pickup;
|
||||
|
|
|
@ -61,7 +61,7 @@ enum nfulnl_attr_type {
|
|||
NFULA_HWTYPE, /* hardware type */
|
||||
NFULA_HWHEADER, /* hardware header */
|
||||
NFULA_HWLEN, /* hardware header length */
|
||||
NFULA_CT, /* nf_conntrack_netlink.h */
|
||||
NFULA_CT, /* nfnetlink_conntrack.h */
|
||||
NFULA_CT_INFO, /* enum ip_conntrack_info */
|
||||
NFULA_VLAN, /* nested attribute: packet vlan info */
|
||||
NFULA_L2HDR, /* full L2 header */
|
||||
|
|
|
@ -51,11 +51,11 @@ enum nfqnl_attr_type {
|
|||
NFQA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */
|
||||
NFQA_HWADDR, /* nfqnl_msg_packet_hw */
|
||||
NFQA_PAYLOAD, /* opaque data payload */
|
||||
NFQA_CT, /* nf_conntrack_netlink.h */
|
||||
NFQA_CT, /* nfnetlink_conntrack.h */
|
||||
NFQA_CT_INFO, /* enum ip_conntrack_info */
|
||||
NFQA_CAP_LEN, /* __u32 length of captured packet */
|
||||
NFQA_SKB_INFO, /* __u32 skb meta information */
|
||||
NFQA_EXP, /* nf_conntrack_netlink.h */
|
||||
NFQA_EXP, /* nfnetlink_conntrack.h */
|
||||
NFQA_UID, /* __u32 sk uid */
|
||||
NFQA_GID, /* __u32 sk gid */
|
||||
NFQA_SECCTX, /* security context string */
|
||||
|
|
|
@ -149,7 +149,15 @@ static void nf_conntrack_all_lock(void)
|
|||
|
||||
spin_lock(&nf_conntrack_locks_all_lock);
|
||||
|
||||
nf_conntrack_locks_all = true;
|
||||
/* For nf_conntrack_locks_all, only the latest time when another
|
||||
* CPU will see an update is controlled, by the "release" of the
|
||||
* spin_lock below.
|
||||
* The earliest time is not controlled, and thus KCSAN could detect
|
||||
* a race when nf_conntrack_lock() reads the variable.
|
||||
* WRITE_ONCE() is used to ensure the compiler will not
|
||||
* optimize the write.
|
||||
*/
|
||||
WRITE_ONCE(nf_conntrack_locks_all, true);
|
||||
|
||||
for (i = 0; i < CONNTRACK_LOCKS; i++) {
|
||||
spin_lock(&nf_conntrack_locks[i]);
|
||||
|
@ -2457,7 +2465,6 @@ i_see_dead_people:
|
|||
}
|
||||
|
||||
list_for_each_entry(net, net_exit_list, exit_list) {
|
||||
nf_conntrack_proto_pernet_fini(net);
|
||||
nf_conntrack_ecache_pernet_fini(net);
|
||||
nf_conntrack_expect_pernet_fini(net);
|
||||
free_percpu(net->ct.stat);
|
||||
|
|
|
@ -218,6 +218,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
|
|||
if (!help)
|
||||
return 0;
|
||||
|
||||
rcu_read_lock();
|
||||
helper = rcu_dereference(help->helper);
|
||||
if (!helper)
|
||||
goto out;
|
||||
|
@ -233,9 +234,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
|
|||
|
||||
nla_nest_end(skb, nest_helper);
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
rcu_read_unlock();
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
|
|
@ -697,13 +697,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
|
|||
#endif
|
||||
}
|
||||
|
||||
void nf_conntrack_proto_pernet_fini(struct net *net)
|
||||
{
|
||||
#ifdef CONFIG_NF_CT_PROTO_GRE
|
||||
nf_ct_gre_keymap_flush(net);
|
||||
#endif
|
||||
}
|
||||
|
||||
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
|
||||
&nf_conntrack_htable_size, 0600);
|
||||
|
||||
|
|
|
@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
|
|||
return &net->ct.nf_ct_proto.gre;
|
||||
}
|
||||
|
||||
void nf_ct_gre_keymap_flush(struct net *net)
|
||||
{
|
||||
struct nf_gre_net *net_gre = gre_pernet(net);
|
||||
struct nf_ct_gre_keymap *km, *tmp;
|
||||
|
||||
spin_lock_bh(&keymap_lock);
|
||||
list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
|
||||
list_del_rcu(&km->list);
|
||||
kfree_rcu(km, rcu);
|
||||
}
|
||||
spin_unlock_bh(&keymap_lock);
|
||||
}
|
||||
|
||||
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
|
||||
const struct nf_conntrack_tuple *t)
|
||||
{
|
||||
|
|
|
@ -823,6 +823,22 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool tcp_can_early_drop(const struct nf_conn *ct)
|
||||
{
|
||||
switch (ct->proto.tcp.state) {
|
||||
case TCP_CONNTRACK_FIN_WAIT:
|
||||
case TCP_CONNTRACK_LAST_ACK:
|
||||
case TCP_CONNTRACK_TIME_WAIT:
|
||||
case TCP_CONNTRACK_CLOSE:
|
||||
case TCP_CONNTRACK_CLOSE_WAIT:
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Returns verdict for packet, or -1 for invalid. */
|
||||
int nf_conntrack_tcp_packet(struct nf_conn *ct,
|
||||
struct sk_buff *skb,
|
||||
|
@ -1030,10 +1046,30 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
|
|||
if (index != TCP_RST_SET)
|
||||
break;
|
||||
|
||||
if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
|
||||
/* If we are closing, tuple might have been re-used already.
|
||||
* last_index, last_ack, and all other ct fields used for
|
||||
* sequence/window validation are outdated in that case.
|
||||
*
|
||||
* As the conntrack can already be expired by GC under pressure,
|
||||
* just skip validation checks.
|
||||
*/
|
||||
if (tcp_can_early_drop(ct))
|
||||
goto in_window;
|
||||
|
||||
/* td_maxack might be outdated if we let a SYN through earlier */
|
||||
if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
|
||||
ct->proto.tcp.last_index != TCP_SYN_SET) {
|
||||
u32 seq = ntohl(th->seq);
|
||||
|
||||
if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
|
||||
/* If we are not in established state and SEQ=0 this is most
|
||||
* likely an answer to a SYN we let go through above (last_index
|
||||
* can be updated due to out-of-order ACKs).
|
||||
*/
|
||||
if (seq == 0 && !nf_conntrack_tcp_established(ct))
|
||||
break;
|
||||
|
||||
if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
|
||||
!tn->tcp_ignore_invalid_rst) {
|
||||
/* Invalid RST */
|
||||
spin_unlock_bh(&ct->lock);
|
||||
nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
|
||||
|
@ -1134,6 +1170,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
|
|||
nf_ct_kill_acct(ct, ctinfo, skb);
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
|
||||
/* do not renew timeout on SYN retransmit.
|
||||
*
|
||||
* Else port reuse by client or NAT middlebox can keep
|
||||
* entry alive indefinitely (including nat info).
|
||||
*/
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
|
||||
* pickup with loose=1. Avoid large ESTABLISHED timeout.
|
||||
*/
|
||||
|
@ -1155,22 +1201,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
|
|||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
static bool tcp_can_early_drop(const struct nf_conn *ct)
|
||||
{
|
||||
switch (ct->proto.tcp.state) {
|
||||
case TCP_CONNTRACK_FIN_WAIT:
|
||||
case TCP_CONNTRACK_LAST_ACK:
|
||||
case TCP_CONNTRACK_TIME_WAIT:
|
||||
case TCP_CONNTRACK_CLOSE:
|
||||
case TCP_CONNTRACK_CLOSE_WAIT:
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
|
||||
|
||||
#include <linux/netfilter/nfnetlink.h>
|
||||
|
@ -1437,6 +1467,9 @@ void nf_conntrack_tcp_init_net(struct net *net)
|
|||
*/
|
||||
tn->tcp_be_liberal = 0;
|
||||
|
||||
/* If it's non-zero, we turn off RST sequence number check */
|
||||
tn->tcp_ignore_invalid_rst = 0;
|
||||
|
||||
/* Max number of the retransmitted packets without receiving an (acceptable)
|
||||
* ACK from the destination. If this number is reached, a shorter timer
|
||||
* will be started.
|
||||
|
|
|
@ -579,6 +579,7 @@ enum nf_ct_sysctl_index {
|
|||
#endif
|
||||
NF_SYSCTL_CT_PROTO_TCP_LOOSE,
|
||||
NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
|
||||
NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
|
||||
NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
|
||||
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
|
||||
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
|
||||
|
@ -798,6 +799,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
|
|||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
[NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
|
||||
.procname = "nf_conntrack_tcp_ignore_invalid_rst",
|
||||
.maxlen = sizeof(u8),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dou8vec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
|
||||
.procname = "nf_conntrack_tcp_max_retrans",
|
||||
.maxlen = sizeof(u8),
|
||||
|
@ -1004,6 +1013,7 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
|
|||
XASSIGN(LOOSE, &tn->tcp_loose);
|
||||
XASSIGN(LIBERAL, &tn->tcp_be_liberal);
|
||||
XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
|
||||
XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
|
||||
#undef XASSIGN
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
|
||||
|
|
|
@ -3446,7 +3446,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
|
|||
return 0;
|
||||
|
||||
err_destroy_flow_rule:
|
||||
nft_flow_rule_destroy(flow);
|
||||
if (flow)
|
||||
nft_flow_rule_destroy(flow);
|
||||
err_release_rule:
|
||||
nf_tables_rule_release(&ctx, rule);
|
||||
err_release_expr:
|
||||
|
|
|
@ -23,15 +23,21 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
|
|||
{
|
||||
struct nft_last_priv *priv = nft_expr_priv(expr);
|
||||
u64 last_jiffies;
|
||||
u32 last_set = 0;
|
||||
int err;
|
||||
|
||||
if (tb[NFTA_LAST_MSECS]) {
|
||||
if (tb[NFTA_LAST_SET]) {
|
||||
last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
|
||||
if (last_set == 1)
|
||||
priv->last_set = 1;
|
||||
}
|
||||
|
||||
if (last_set && tb[NFTA_LAST_MSECS]) {
|
||||
err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
priv->last_jiffies = jiffies + (unsigned long)last_jiffies;
|
||||
priv->last_set = 1;
|
||||
priv->last_jiffies = jiffies - (unsigned long)last_jiffies;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -5,7 +5,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
|
|||
conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
|
||||
nft_concat_range.sh nft_conntrack_helper.sh \
|
||||
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
|
||||
ipip-conntrack-mtu.sh
|
||||
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh
|
||||
|
||||
LDLIBS = -lmnl
|
||||
TEST_GEN_FILES = nf-queue
|
||||
|
|
|
@ -0,0 +1,167 @@
|
|||
#!/bin/bash
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# Check that UNREPLIED tcp conntrack will eventually timeout.
|
||||
#
|
||||
|
||||
# Kselftest framework requirement - SKIP code is 4.
|
||||
ksft_skip=4
|
||||
ret=0
|
||||
|
||||
waittime=20
|
||||
sfx=$(mktemp -u "XXXXXXXX")
|
||||
ns1="ns1-$sfx"
|
||||
ns2="ns2-$sfx"
|
||||
|
||||
nft --version > /dev/null 2>&1
|
||||
if [ $? -ne 0 ];then
|
||||
echo "SKIP: Could not run test without nft tool"
|
||||
exit $ksft_skip
|
||||
fi
|
||||
|
||||
ip -Version > /dev/null 2>&1
|
||||
if [ $? -ne 0 ];then
|
||||
echo "SKIP: Could not run test without ip tool"
|
||||
exit $ksft_skip
|
||||
fi
|
||||
|
||||
# Tear down the test environment: first kill every process still running
# inside the two test namespaces, then delete the namespaces themselves.
cleanup() {
	local netns

	# kill errors (e.g. no pids left) are expected and silenced
	for netns in $ns1 $ns2; do
		ip netns pids $netns | xargs kill 2>/dev/null
	done

	for netns in $ns1 $ns2; do
		ip netns del $netns
	done
}
|
||||
|
||||
# Print "192.168.<arg>.2" (no trailing newline), where <arg> is the first
# positional parameter.
ipv4() {
	local third_octet=$1

	echo -n 192.168.$third_octet.2
}
|
||||
|
||||
# Verify that a named nft counter in a namespace shows the expected value.
#
#   $1 - network namespace to query
#   $2 - counter name inside "table inet filter"
#   $3 - string the counter listing must contain, e.g. "packets 1 bytes 60"
#
# Returns 0 when the listing contains the expected string. Otherwise prints
# a diagnostic plus the actual counter listing to stderr and returns 1.
check_counter()
{
	# Keep the arguments local so the caller's globals (notably $expect)
	# are not overwritten.
	local ns=$1
	local name=$2
	local expect=$3
	local lret=0

	# Query the namespace that was passed in rather than a hard-coded one;
	# grep -q produces no output, so only its exit status is of interest.
	if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then
		echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2
		ip netns exec "$ns" nft list counter inet filter "$name" 1>&2
		lret=1
	fi

	return $lret
}
|
||||
|
||||
# Create test namespaces
|
||||
ip netns add $ns1 || exit 1
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
ip netns add $ns2 || exit 1
|
||||
|
||||
# Connect the two namespaces to each other using a veth pair
|
||||
ip -net $ns1 link add name veth1 type veth peer name veth2
|
||||
ip -net $ns1 link set netns $ns2 dev veth2
|
||||
|
||||
ip -net $ns1 link set up dev lo
|
||||
ip -net $ns2 link set up dev lo
|
||||
ip -net $ns1 link set up dev veth1
|
||||
ip -net $ns2 link set up dev veth2
|
||||
|
||||
ip -net $ns2 addr add 10.11.11.2/24 dev veth2
|
||||
ip -net $ns2 route add default via 10.11.11.1
|
||||
|
||||
ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1
|
||||
|
||||
# add a rule inside NS so we enable conntrack
|
||||
ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT
|
||||
|
||||
ip -net $ns1 addr add 10.11.11.1/24 dev veth1
|
||||
ip -net $ns1 route add 10.99.99.99 via 10.11.11.2
|
||||
|
||||
# Check connectivity works
|
||||
ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1
|
||||
|
||||
ip netns exec $ns2 nc -l -p 8080 < /dev/null &
|
||||
|
||||
# however, conntrack entries are there
|
||||
|
||||
ip netns exec $ns2 nft -f - <<EOF
|
||||
table inet filter {
|
||||
counter connreq { }
|
||||
counter redir { }
|
||||
chain input {
|
||||
type filter hook input priority 0; policy accept;
|
||||
ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
|
||||
ct state new ct status dnat tcp dport 8080 counter name "redir" accept
|
||||
}
|
||||
}
|
||||
EOF
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "ERROR: Could not load nft rules"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10
|
||||
|
||||
echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
|
||||
ip netns exec $ns1 bash -c 'while true ; do
|
||||
nc -p 60000 10.99.99.99 80
|
||||
sleep 1
|
||||
done' &
|
||||
|
||||
sleep 1
|
||||
|
||||
ip netns exec $ns2 nft -f - <<EOF
|
||||
table inet nat {
|
||||
chain prerouting {
|
||||
type nat hook prerouting priority 0; policy accept;
|
||||
ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
|
||||
}
|
||||
}
|
||||
EOF
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "ERROR: Could not load nat redirect"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
|
||||
if [ $count -eq 0 ]; then
|
||||
echo "ERROR: $ns2 did not pick up tcp connection from peer"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect"
|
||||
for i in $(seq 1 $waittime); do
|
||||
echo -n "."
|
||||
|
||||
sleep 1
|
||||
|
||||
count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
|
||||
if [ $count -gt 0 ]; then
|
||||
echo
|
||||
echo "PASS: redirection took effect after $i seconds"
|
||||
break
|
||||
fi
|
||||
|
||||
m=$((i%20))
|
||||
if [ $m -eq 0 ]; then
|
||||
echo " waited for $i seconds"
|
||||
fi
|
||||
done
|
||||
|
||||
expect="packets 1 bytes 60"
|
||||
check_counter "$ns2" "redir" "$expect"
|
||||
if [ $? -ne 0 ]; then
|
||||
ret=1
|
||||
fi
|
||||
|
||||
if [ $ret -eq 0 ];then
|
||||
echo "PASS: redirection counter has expected values"
|
||||
else
|
||||
echo "ERROR: no tcp connection was redirected"
|
||||
fi
|
||||
|
||||
exit $ret
|
Loading…
Reference in New Issue