Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

The following patchset contains Netfilter/IPVS fixes for net:

1) Add selftest for vrf+conntrack, from Florian Westphal.

2) Extend nfqueue selftest to cover nfqueue, also from Florian.

3) Remove duplicated include in nft_payload, from Wan Jiabing.

4) Several improvements to the nat port shadowing selftest,
   from Phil Sutter.

5) Fix filtering of reply tuple in ctnetlink, from Florent Fourcot.

6) Do not override error with -EINVAL in filter setup path, also
   from Florent.

7) Honor sysctl_expire_nodest_conn regardless conn_reuse_mode for
   reused connections, from yangxingwu.

8) Replace snprintf() by sysfs_emit() in xt_IDLETIMER as reported
   by Coccinelle, from Jing Yao.

9) Incorrect IPv6 tunnel match in flowtable offload, from Will
   Mortensen.

10) Switch port shadow selftest to use socat, from Florian Westphal.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2021-11-19 11:00:01 +00:00
commit d6821c5bc6
10 changed files with 310 additions and 27 deletions

View File

@ -37,8 +37,7 @@ conn_reuse_mode - INTEGER
0: disable any special handling on port reuse. The new
connection will be delivered to the same real server that was
servicing the previous connection. This will effectively
disable expire_nodest_conn.
servicing the previous connection.
bit 1: enable rescheduling of new connections when it is safe.
That is, whenever expire_nodest_conn and for TCP sockets, when

View File

@ -1919,7 +1919,6 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
int conn_reuse_mode;
struct sock *sk;
int af = state->pf;
@ -1997,15 +1996,16 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state
cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
ipvs, af, skb, &iph);
conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) {
int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
bool old_ct = false, resched = false;
if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
unlikely(!atomic_read(&cp->dest->weight))) {
resched = true;
old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
} else if (is_new_conn_expected(cp, conn_reuse_mode)) {
} else if (conn_reuse_mode &&
is_new_conn_expected(cp, conn_reuse_mode)) {
old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
if (!atomic_read(&cp->n_control)) {
resched = true;

View File

@ -1011,11 +1011,9 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
CTA_TUPLE_REPLY,
filter->family,
&filter->zone,
filter->orig_flags);
if (err < 0) {
err = -EINVAL;
filter->reply_flags);
if (err < 0)
goto err_filter;
}
}
return filter;

View File

@ -65,11 +65,11 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
sizeof(struct in6_addr));
if (memcmp(&key->enc_ipv6.src, &in6addr_any,
sizeof(struct in6_addr)))
memset(&key->enc_ipv6.src, 0xff,
memset(&mask->enc_ipv6.src, 0xff,
sizeof(struct in6_addr));
if (memcmp(&key->enc_ipv6.dst, &in6addr_any,
sizeof(struct in6_addr)))
memset(&key->enc_ipv6.dst, 0xff,
memset(&mask->enc_ipv6.dst, 0xff,
sizeof(struct in6_addr));
enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;

View File

@ -22,7 +22,6 @@
#include <linux/icmpv6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/ip.h>
#include <net/sctp/checksum.h>
static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,

View File

@ -85,9 +85,9 @@ static ssize_t idletimer_tg_show(struct device *dev,
mutex_unlock(&list_mutex);
if (time_after(expires, jiffies) || ktimespec.tv_sec > 0)
return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff);
return sysfs_emit(buf, "%ld\n", time_diff);
return snprintf(buf, PAGE_SIZE, "0\n");
return sysfs_emit(buf, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)

View File

@ -5,7 +5,8 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
nft_concat_range.sh nft_conntrack_helper.sh \
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \
conntrack_vrf.sh
LDLIBS = -lmnl
TEST_GEN_FILES = nf-queue

View File

@ -0,0 +1,219 @@
#!/bin/sh
# This script demonstrates interaction of conntrack and vrf.
# The vrf driver calls the netfilter hooks again, with oif/iif
# pointing at the VRF device.
#
# For ingress, this means first iteration has iifname of lower/real
# device. In this script, thats veth0.
# Second iteration is iifname set to vrf device, tvrf in this script.
#
# For egress, this is reversed: first iteration has the vrf device,
# second iteration is done with the lower/real/veth0 device.
#
# test_ct_zone_in demonstrates unexpected change of nftables
# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack
# connection on VRF rcv"
#
# It was possible to assign conntrack zone to a packet (or mark it for
# `notracking`) in the prerouting chain before conntrack, based on real iif.
#
# After the change, the zone assignment is lost and the zone is assigned based
# on the VRF master interface (in case such a rule exists).
# assignment is lost. Instead, assignment based on the `iif` matching
# Thus it is impossible to distinguish packets based on the original
# interface.
#
# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem
# that was supposed to be fixed by the commit mentioned above to make sure
# that any fix to test case 1 won't break masquerade again.
ksft_skip=4
IP0=172.30.30.1
IP1=172.30.30.2
PFXL=30
ret=0
sfx=$(mktemp -u "XXXXXXXX")
ns0="ns0-$sfx"
ns1="ns1-$sfx"
cleanup()
{
ip netns pids $ns0 | xargs kill 2>/dev/null
ip netns pids $ns1 | xargs kill 2>/dev/null
ip netns del $ns0 $ns1
}
nft --version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without nft tool"
exit $ksft_skip
fi
ip -Version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without ip tool"
exit $ksft_skip
fi
ip netns add "$ns0"
if [ $? -ne 0 ];then
echo "SKIP: Could not create net namespace $ns0"
exit $ksft_skip
fi
ip netns add "$ns1"
trap cleanup EXIT
ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0
ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not add veth device"
exit $ksft_skip
fi
ip -net $ns0 li add tvrf type vrf table 9876
if [ $? -ne 0 ];then
echo "SKIP: Could not add vrf device"
exit $ksft_skip
fi
ip -net $ns0 li set lo up
ip -net $ns0 li set veth0 master tvrf
ip -net $ns0 li set tvrf up
ip -net $ns0 li set veth0 up
ip -net $ns1 li set veth0 up
ip -net $ns0 addr add $IP0/$PFXL dev veth0
ip -net $ns1 addr add $IP1/$PFXL dev veth0
ip netns exec $ns1 iperf3 -s > /dev/null 2>&1&
if [ $? -ne 0 ];then
echo "SKIP: Could not start iperf3"
exit $ksft_skip
fi
# test vrf ingress handling.
# The incoming connection should be placed in conntrack zone 1,
# as decided by the first iteration of the ruleset.
test_ct_zone_in()
{
ip netns exec $ns0 nft -f - <<EOF
table testct {
chain rawpre {
type filter hook prerouting priority raw;
iif { veth0, tvrf } counter meta nftrace set 1
iif veth0 counter ct zone set 1 counter return
iif tvrf counter ct zone set 2 counter return
ip protocol icmp counter
notrack counter
}
chain rawout {
type filter hook output priority raw;
oif veth0 counter ct zone set 1 counter return
oif tvrf counter ct zone set 2 counter return
notrack counter
}
}
EOF
ip netns exec $ns1 ping -W 1 -c 1 -I veth0 $IP0 > /dev/null
# should be in zone 1, not zone 2
count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l)
if [ $count -eq 1 ]; then
echo "PASS: entry found in conntrack zone 1"
else
echo "FAIL: entry not found in conntrack zone 1"
count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l)
if [ $count -eq 1 ]; then
echo "FAIL: entry found in zone 2 instead"
else
echo "FAIL: entry not in zone 1 or 2, dumping table"
ip netns exec $ns0 conntrack -L
ip netns exec $ns0 nft list ruleset
fi
fi
}
# add masq rule that gets evaluated w. outif set to vrf device.
# This tests the first iteration of the packet through conntrack,
# oifname is the vrf device.
test_masquerade_vrf()
{
ip netns exec $ns0 conntrack -F 2>/dev/null
ip netns exec $ns0 nft -f - <<EOF
flush ruleset
table ip nat {
chain postrouting {
type nat hook postrouting priority 0;
# NB: masquerade should always be combined with 'oif(name) bla',
# lack of this is intentional here, we want to exercise double-snat.
ip saddr 172.30.30.0/30 counter masquerade random
}
}
EOF
ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null
if [ $? -ne 0 ]; then
echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device"
ret=1
return
fi
# must also check that nat table was evaluated on second (lower device) iteration.
ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2'
if [ $? -eq 0 ]; then
echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device"
else
echo "FAIL: vrf masq rule has unexpected counter value"
ret=1
fi
}
# add masq rule that gets evaluated w. outif set to veth device.
# This tests the 2nd iteration of the packet through conntrack,
# oifname is the lower device (veth0 in this case).
test_masquerade_veth()
{
ip netns exec $ns0 conntrack -F 2>/dev/null
ip netns exec $ns0 nft -f - <<EOF
flush ruleset
table ip nat {
chain postrouting {
type nat hook postrouting priority 0;
meta oif veth0 ip saddr 172.30.30.0/30 counter masquerade random
}
}
EOF
ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null
if [ $? -ne 0 ]; then
echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device"
ret=1
return
fi
# must also check that nat table was evaluated on second (lower device) iteration.
ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2'
if [ $? -eq 0 ]; then
echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device"
else
echo "FAIL: vrf masq rule has unexpected counter value"
ret=1
fi
}
test_ct_zone_in
test_masquerade_vrf
test_masquerade_veth
exit $ret

View File

@ -759,19 +759,21 @@ test_port_shadow()
local result=""
local logmsg=""
echo ROUTER | ip netns exec "$ns0" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 &
nc_r=$!
echo CLIENT | ip netns exec "$ns2" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 &
nc_c=$!
# make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405.
echo "fake-entry" | ip netns exec "$ns2" nc -w 1 -p 1405 -u "$daddrc" 41404 > /dev/null
echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405
echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 &
sc_r=$!
echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport &
sc_c=$!
sleep 0.3
# ns1 tries to connect to ns0:1405. With default settings this should connect
# to client, it matches the conntrack entry created above.
result=$(echo "" | ip netns exec "$ns1" nc -w 1 -p 41404 -u "$daddrs" 1405)
result=$(echo "data" | ip netns exec "$ns1" timeout 1 socat - UDP:"$daddrs":1405,sourceport=41404)
if [ "$result" = "$expect" ] ;then
echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}"
@ -780,7 +782,7 @@ test_port_shadow()
ret=1
fi
kill $nc_r $nc_c 2>/dev/null
kill $sc_r $sc_c 2>/dev/null
# flush udp entries for next test round, if any
ip netns exec "$ns0" conntrack -F >/dev/null 2>&1
@ -816,11 +818,10 @@ table $family raw {
chain prerouting {
type filter hook prerouting priority -300; policy accept;
meta iif veth0 udp dport 1405 notrack
udp dport 1405 notrack
}
chain output {
type filter hook output priority -300; policy accept;
udp sport 1405 notrack
meta oif veth0 udp sport 1405 notrack
}
}
EOF
@ -851,6 +852,18 @@ test_port_shadowing()
{
local family="ip"
conntrack -h >/dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run nat port shadowing test without conntrack tool"
return
fi
socat -h > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run nat port shadowing test without socat tool"
return
fi
ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null

View File

@ -16,6 +16,10 @@ timeout=4
cleanup()
{
ip netns pids ${ns1} | xargs kill 2>/dev/null
ip netns pids ${ns2} | xargs kill 2>/dev/null
ip netns pids ${nsrouter} | xargs kill 2>/dev/null
ip netns del ${ns1}
ip netns del ${ns2}
ip netns del ${nsrouter}
@ -332,6 +336,55 @@ EOF
echo "PASS: tcp via loopback and re-queueing"
}
test_icmp_vrf() {
ip -net $ns1 link add tvrf type vrf table 9876
if [ $? -ne 0 ];then
echo "SKIP: Could not add vrf device"
return
fi
ip -net $ns1 li set eth0 master tvrf
ip -net $ns1 li set tvrf up
ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876
ip netns exec ${ns1} nft -f /dev/stdin <<EOF
flush ruleset
table inet filter {
chain output {
type filter hook output priority 0; policy accept;
meta oifname "tvrf" icmp type echo-request counter queue num 1
meta oifname "eth0" icmp type echo-request counter queue num 1
}
chain post {
type filter hook postrouting priority 0; policy accept;
meta oifname "tvrf" icmp type echo-request counter queue num 1
meta oifname "eth0" icmp type echo-request counter queue num 1
}
}
EOF
ip netns exec ${ns1} ./nf-queue -q 1 -t $timeout &
local nfqpid=$!
sleep 1
ip netns exec ${ns1} ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null
for n in output post; do
for d in tvrf eth0; do
ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"
if [ $? -ne 0 ] ; then
echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2
ip netns exec ${ns1} nft list ruleset
ret=1
return
fi
done
done
wait $nfqpid
[ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf"
wait 2>/dev/null
}
ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
@ -372,5 +425,6 @@ test_queue 20
test_tcp_forward
test_tcp_localhost
test_tcp_localhost_requeue
test_icmp_vrf
exit $ret