From 1cc4a018669f2fb18c10010f1a7ab3f6fb688cef Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 20 Aug 2017 13:38:07 +0800
Subject: [PATCH 1/9] netfilter: ipvs: fix the issue that sctp_conn_schedule
 drops non-INIT packet

Commit 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP
packets") changed to check packet type early. It introduced a side
effect: if it's not a INIT packet, ports will be set as  NULL, and
the packet will be dropped later.

It caused that sctp couldn't create connection when ipvs module is
loaded and any scheduler is registered on server.

Li Shuang reproduced it by running the cmds on sctp server:
  # ipvsadm -A -t 1.1.1.1:80 -s rr
  # ipvsadm -D -t 1.1.1.1:80
then the server could't work any more.

This patch is to return 1 when it's not an INIT packet. It means ipvs
will accept it without creating a conn for it, just like what it does
for tcp.

Fixes: 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets")
Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_proto_sctp.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index e1efa446b305..81f08198b125 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -24,9 +24,12 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
 		if (sh) {
 			sch = skb_header_pointer(skb, iph->len + sizeof(_sctph),
 						 sizeof(_schunkh), &_schunkh);
-			if (sch && (sch->type == SCTP_CID_INIT ||
-				    sysctl_sloppy_sctp(ipvs)))
+			if (sch) {
+				if (!(sysctl_sloppy_sctp(ipvs) ||
+				      sch->type == SCTP_CID_INIT))
+					return 1;
 				ports = &sh->source;
+			}
 		}
 	} else {
 		ports = skb_header_pointer(

From 68913a018f6082f8f90abb8ff9114435ef45dff7 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 20 Aug 2017 13:38:08 +0800
Subject: [PATCH 2/9] netfilter: ipvs: do not create conn for ABORT packet in
 sctp_conn_schedule

There's no reason for ipvs to create a conn for an ABORT packet
even if sysctl_sloppy_sctp is set.

This patch is to accept it without creating a conn, just as ipvs
does for tcp's RST packet.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_proto_sctp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 81f08198b125..57c8ee66491e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -25,7 +25,8 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
 			sch = skb_header_pointer(skb, iph->len + sizeof(_sctph),
 						 sizeof(_schunkh), &_schunkh);
 			if (sch) {
-				if (!(sysctl_sloppy_sctp(ipvs) ||
+				if (sch->type == SCTP_CID_ABORT ||
+				    !(sysctl_sloppy_sctp(ipvs) ||
 				      sch->type == SCTP_CID_INIT))
 					return 1;
 				ports = &sh->source;

From 75c2631468e8af554057246b2413e738dd96af3d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 31 Aug 2017 13:45:24 +0200
Subject: [PATCH 3/9] netfilter: nf_nat: don't bug when mapping already exists

It seems preferrable to limp along if we have a conflicting mapping,
its certainly better than a BUG().

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 40573aa6c133..dc3519cc7209 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -416,7 +416,9 @@ nf_nat_setup_info(struct nf_conn *ct,
 
 	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
 		maniptype != NF_NAT_MANIP_DST);
-	BUG_ON(nf_nat_initialized(ct, maniptype));
+
+	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
+		return NF_DROP;
 
 	/* What we've got will look like inverse of reply. Normally
 	 * this is what is in the conntrack, except for prior

From a5d7a714569199f909cd60ff7074107bf15c7db4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 1 Sep 2017 22:41:03 +0200
Subject: [PATCH 4/9] netfilter: xtables: add scheduling opportunity in
 get_counters

There are reports about spurious softlockups during iptables-restore, a
backtrace i saw points at get_counters -- it uses a sequence lock and also
has unbounded restart loop.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 1 +
 net/ipv4/netfilter/ip_tables.c  | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e04457198f93..9e2770fd00be 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -629,6 +629,7 @@ static void get_counters(const struct xt_table_info *t,
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
+			cond_resched();
 		}
 	}
 }
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 576cba2b57e9..39286e543ee6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -776,6 +776,7 @@ get_counters(const struct xt_table_info *t,
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
+			cond_resched();
 		}
 	}
 }
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 54b1e75eded1..01bd3ee5ebc6 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -795,6 +795,7 @@ get_counters(const struct xt_table_info *t,
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
+			cond_resched();
 		}
 	}
 }

From e1bf1687740ce1a3598a1c5e452b852ff2190682 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 6 Sep 2017 14:39:51 +0200
Subject: [PATCH 5/9] netfilter: nat: Revert "netfilter: nat: convert nat bysrc
 hash to rhashtable"

This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.

It was not a good idea. The custom hash table was a much better
fit for this purpose.

A fast lookup is not essential, in fact for most cases there is no lookup
at all because original tuple is not taken and can be used as-is.
What needs to be fast is insertion and deletion.

rhlist removal however requires a rhlist walk.
We can have thousands of entries in such a list if source port/addresses
are reused for multiple flows, if this happens removal requests are so
expensive that deletions of a few thousand flows can take several
seconds(!).

The advantages that we got from rhashtable are:
1) table auto-sizing
2) multiple locks

1) would be nice to have, but it is not essential as we have at
most one lookup per new flow, so even a million flows in the bysource
table are not a problem compared to current deletion cost.
2) is easy to add to custom hash table.

I tried to add hlist_node to rhlist to speed up rhltable_remove but this
isn't doable without changing semantics.  rhltable_remove_fast will
check that the to-be-deleted object is part of the table and that
requires a list walk that we want to avoid.

Furthermore, using hlist_node increases size of struct rhlist_head, which
in turn increases nf_conn size.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
Reported-by: Ivan Babrou <ibobrik@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |   3 +-
 include/net/netfilter/nf_nat.h       |   1 -
 net/netfilter/nf_nat_core.c          | 126 +++++++++++----------------
 3 files changed, 52 insertions(+), 78 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index fdc9c64a1c94..8f3bd30511de 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -17,7 +17,6 @@
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/atomic.h>
-#include <linux/rhashtable.h>
 
 #include <linux/netfilter/nf_conntrack_tcp.h>
 #include <linux/netfilter/nf_conntrack_dccp.h>
@@ -77,7 +76,7 @@ struct nf_conn {
 	possible_net_t ct_net;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
-	struct rhlist_head nat_bysource;
+	struct hlist_node	nat_bysource;
 #endif
 	/* all members below initialized via memset */
 	u8 __nfct_init_offset[0];
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 05c82a1a4267..b71701302e61 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -1,6 +1,5 @@
 #ifndef _NF_NAT_H
 #define _NF_NAT_H
-#include <linux/rhashtable.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/nf_nat.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index dc3519cc7209..f090419f5f97 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,19 +30,17 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
+static DEFINE_SPINLOCK(nf_nat_lock);
+
 static DEFINE_MUTEX(nf_nat_proto_mutex);
 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 
-struct nf_nat_conn_key {
-	const struct net *net;
-	const struct nf_conntrack_tuple *tuple;
-	const struct nf_conntrack_zone *zone;
-};
-
-static struct rhltable nf_nat_bysource_table;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
 
 inline const struct nf_nat_l3proto *
 __nf_nat_l3proto_find(u8 family)
@@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
 EXPORT_SYMBOL(nf_xfrm_me_harder);
 #endif /* CONFIG_XFRM */
 
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
+/* We keep an extra hash for each conntrack, for fast searching. */
+static unsigned int
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
 {
-	const struct nf_conntrack_tuple *t;
-	const struct nf_conn *ct = data;
+	unsigned int hash;
+
+	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
 
-	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 	/* Original src, to ensure we map it consistently if poss. */
+	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
 
-	seed ^= net_hash_mix(nf_ct_net(ct));
-	return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
-		      t->dst.protonum ^ seed);
+	return reciprocal_scale(hash, nf_nat_htable_size);
 }
 
 /* Is this tuple already taken? (not by us) */
@@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct,
 		t->src.u.all == tuple->src.u.all);
 }
 
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
-			       const void *obj)
-{
-	const struct nf_nat_conn_key *key = arg->key;
-	const struct nf_conn *ct = obj;
-
-	if (!same_src(ct, key->tuple) ||
-	    !net_eq(nf_ct_net(ct), key->net) ||
-	    !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
-		return 1;
-
-	return 0;
-}
-
-static struct rhashtable_params nf_nat_bysource_params = {
-	.head_offset = offsetof(struct nf_conn, nat_bysource),
-	.obj_hashfn = nf_nat_bysource_hash,
-	.obj_cmpfn = nf_nat_bysource_cmp,
-	.nelem_hint = 256,
-	.min_size = 1024,
-};
-
 /* Only called for SRC manip */
 static int
 find_appropriate_src(struct net *net,
@@ -216,26 +194,22 @@ find_appropriate_src(struct net *net,
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range *range)
 {
+	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn *ct;
-	struct nf_nat_conn_key key = {
-		.net = net,
-		.tuple = tuple,
-		.zone = zone
-	};
-	struct rhlist_head *hl, *h;
 
-	hl = rhltable_lookup(&nf_nat_bysource_table, &key,
-			     nf_nat_bysource_params);
+	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
+		if (same_src(ct, tuple) &&
+		    net_eq(net, nf_ct_net(ct)) &&
+		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
+			/* Copy source part from reply tuple. */
+			nf_ct_invert_tuplepr(result,
+				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+			result->dst = tuple->dst;
 
-	rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
-		nf_ct_invert_tuplepr(result,
-				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-		result->dst = tuple->dst;
-
-		if (in_range(l3proto, l4proto, result, range))
-			return 1;
+			if (in_range(l3proto, l4proto, result, range))
+				return 1;
+		}
 	}
-
 	return 0;
 }
 
@@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple curr_tuple, new_tuple;
 
 	/* Can't setup nat info for confirmed ct. */
@@ -449,19 +424,14 @@ nf_nat_setup_info(struct nf_conn *ct,
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
-		struct nf_nat_conn_key key = {
-			.net = nf_ct_net(ct),
-			.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
-			.zone = nf_ct_zone(ct),
-		};
-		int err;
+		unsigned int srchash;
 
-		err = rhltable_insert_key(&nf_nat_bysource_table,
-					  &key,
-					  &ct->nat_bysource,
-					  nf_nat_bysource_params);
-		if (err)
-			return NF_DROP;
+		srchash = hash_by_src(net,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		spin_lock_bh(&nf_nat_lock);
+		hlist_add_head_rcu(&ct->nat_bysource,
+				   &nf_nat_bysource[srchash]);
+		spin_unlock_bh(&nf_nat_lock);
 	}
 
 	/* It's done. */
@@ -570,8 +540,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 	 * will delete entry from already-freed table.
 	 */
 	clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
-	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-			nf_nat_bysource_params);
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&ct->nat_bysource);
+	spin_unlock_bh(&nf_nat_lock);
 
 	/* don't delete conntrack.  Although that would make things a lot
 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -699,9 +670,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
 /* No one using conntrack by the time this called. */
 static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 {
-	if (ct->status & IPS_SRC_NAT_DONE)
-		rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-				nf_nat_bysource_params);
+	if (ct->status & IPS_SRC_NAT_DONE) {
+		spin_lock_bh(&nf_nat_lock);
+		hlist_del_rcu(&ct->nat_bysource);
+		spin_unlock_bh(&nf_nat_lock);
+	}
 }
 
 static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -825,13 +798,16 @@ static int __init nf_nat_init(void)
 {
 	int ret;
 
-	ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
-	if (ret)
-		return ret;
+	/* Leave them the same for the moment. */
+	nf_nat_htable_size = nf_conntrack_htable_size;
+
+	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+	if (!nf_nat_bysource)
+		return -ENOMEM;
 
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
-		rhltable_destroy(&nf_nat_bysource_table);
+		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
 		return ret;
 	}
@@ -865,8 +841,8 @@ static void __exit nf_nat_cleanup(void)
 
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		kfree(nf_nat_l4protos[i]);
-
-	rhltable_destroy(&nf_nat_bysource_table);
+	synchronize_net();
+	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 }
 
 MODULE_LICENSE("GPL");

From 8073e960a03bf7b5d5ebfc5ff18ac475e1688f46 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 6 Sep 2017 14:39:52 +0200
Subject: [PATCH 6/9] netfilter: nat: use keyed locks

no need to serialize on a single lock, we can partition the table and
add/delete in parallel to different slots.
This restores one of the advantages that got lost with the rhlist
revert.

Cc: Ivan Babrou <ibobrik@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_core.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index f090419f5f97..f393a7086025 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,7 +30,7 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
-static DEFINE_SPINLOCK(nf_nat_lock);
+static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
 
 static DEFINE_MUTEX(nf_nat_proto_mutex);
 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
@@ -425,13 +425,15 @@ nf_nat_setup_info(struct nf_conn *ct,
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
 		unsigned int srchash;
+		spinlock_t *lock;
 
 		srchash = hash_by_src(net,
 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-		spin_lock_bh(&nf_nat_lock);
+		lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)];
+		spin_lock_bh(lock);
 		hlist_add_head_rcu(&ct->nat_bysource,
 				   &nf_nat_bysource[srchash]);
-		spin_unlock_bh(&nf_nat_lock);
+		spin_unlock_bh(lock);
 	}
 
 	/* It's done. */
@@ -525,6 +527,16 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
 	return i->status & IPS_NAT_MASK ? 1 : 0;
 }
 
+static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+{
+	unsigned int h;
+
+	h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
+	hlist_del_rcu(&ct->nat_bysource);
+	spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
+}
+
 static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 {
 	if (nf_nat_proto_remove(ct, data))
@@ -540,9 +552,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 	 * will delete entry from already-freed table.
 	 */
 	clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
-	spin_lock_bh(&nf_nat_lock);
-	hlist_del_rcu(&ct->nat_bysource);
-	spin_unlock_bh(&nf_nat_lock);
+	__nf_nat_cleanup_conntrack(ct);
 
 	/* don't delete conntrack.  Although that would make things a lot
 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -670,11 +680,8 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
 /* No one using conntrack by the time this called. */
 static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 {
-	if (ct->status & IPS_SRC_NAT_DONE) {
-		spin_lock_bh(&nf_nat_lock);
-		hlist_del_rcu(&ct->nat_bysource);
-		spin_unlock_bh(&nf_nat_lock);
-	}
+	if (ct->status & IPS_SRC_NAT_DONE)
+		__nf_nat_cleanup_conntrack(ct);
 }
 
 static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -796,10 +803,12 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
 
 static int __init nf_nat_init(void)
 {
-	int ret;
+	int ret, i;
 
 	/* Leave them the same for the moment. */
 	nf_nat_htable_size = nf_conntrack_htable_size;
+	if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks))
+		nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks);
 
 	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
 	if (!nf_nat_bysource)
@@ -812,6 +821,9 @@ static int __init nf_nat_init(void)
 		return ret;
 	}
 
+	for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++)
+		spin_lock_init(&nf_nat_locks[i]);
+
 	nf_ct_helper_expectfn_register(&follow_master_nat);
 
 	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);

From 74585d4f84379528347630253c42518c5002d2f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 6 Sep 2017 14:47:57 +0200
Subject: [PATCH 7/9] netfilter: core: remove erroneous warn_on

kernel test robot reported:

WARNING: CPU: 0 PID: 1244 at net/netfilter/core.c:218 __nf_hook_entries_try_shrink+0x49/0xcd
[..]

After allowing batching in nf_unregister_net_hooks its possible that an earlier
call to __nf_hook_entries_try_shrink already compacted the list.
If this happens we don't need to do anything.

Fixes: d3ad2c17b4047 ("netfilter: core: batch nf_unregister_net_hooks synchronize_net calls")
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 04fe25abc5f6..52cd2901a097 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -215,7 +215,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp)
 	if (skip == hook_entries)
 		goto out_assign;
 
-	if (WARN_ON(skip == 0))
+	if (skip == 0)
 		return NULL;
 
 	hook_entries -= skip;

From 05d0eae7c1cf6bf7b19c02b3a97ff457b3317323 Mon Sep 17 00:00:00 2001
From: Zhizhou Tian <zhizhou.tian@gmail.com>
Date: Fri, 8 Sep 2017 11:00:16 +0800
Subject: [PATCH 8/9] netfilter: xt_hashlimit: alloc hashtable with right size

struct xt_byteslimit_htable used hlist_head, but memory allocation is
done through sizeof(struct list_head).

Signed-off-by: Zhizhou Tian <zhizhou.tian@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_hashlimit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10d48234f5f4..962ea4a63d9f 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -279,7 +279,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 		size = cfg->size;
 	} else {
 		size = (totalram_pages << PAGE_SHIFT) / 16384 /
-		       sizeof(struct list_head);
+		       sizeof(struct hlist_head);
 		if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
 			size = 8192;
 		if (size < 16)
@@ -287,7 +287,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 	}
 	/* FIXME: don't use vmalloc() here or anywhere else -HW */
 	hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) +
-	                sizeof(struct list_head) * size);
+	                sizeof(struct hlist_head) * size);
 	if (hinfo == NULL)
 		return -ENOMEM;
 	*out_hinfo = hinfo;

From 90c4ae4e2c1da9f1eaf846136861af43d4c1ff34 Mon Sep 17 00:00:00 2001
From: Vishwanath Pai <vpai@akamai.com>
Date: Fri, 8 Sep 2017 01:38:58 -0400
Subject: [PATCH 9/9] netfilter: xt_hashlimit: fix build error caused by 64bit
 division

64bit division causes build/link errors on 32bit architectures. It
prints out error messages like:

ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined!

The value of avg passed through by userspace in BYTE mode cannot exceed
U32_MAX. Which means 64bit division in user2rate_bytes is unnecessary.
To fix this I have changed the type of param 'user' to u32.

Since anything greater than U32_MAX is an invalid input we error out in
hashlimit_mt_check_common() when this is the case.

Changes in v2:
	Making return type as u32 would cause an overflow for small
	values of 'user' (for example 2, 3 etc). To avoid this I bumped up
	'r' to u64 again as well as the return type. This is OK since the
	variable that stores the result is u64. We still avoid 64bit
	division here since 'user' is u32.

Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode")
Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_hashlimit.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 962ea4a63d9f..5da8746f7b88 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -35,6 +35,7 @@
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/netfilter/xt_hashlimit.h>
 #include <linux/mutex.h>
+#include <linux/kernel.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
@@ -527,12 +528,12 @@ static u64 user2rate(u64 user)
 	}
 }
 
-static u64 user2rate_bytes(u64 user)
+static u64 user2rate_bytes(u32 user)
 {
 	u64 r;
 
-	r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL;
-	r = (r - 1) << 4;
+	r = user ? U32_MAX / user : U32_MAX;
+	r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
 	return r;
 }
 
@@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
 		dh->rateinfo.prev_window = 0;
 		dh->rateinfo.current_rate = 0;
 		if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
-			dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg);
+			dh->rateinfo.rate =
+				user2rate_bytes((u32)hinfo->cfg.avg);
 			if (hinfo->cfg.burst)
 				dh->rateinfo.burst =
 					hinfo->cfg.burst * dh->rateinfo.rate;
@@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
 
 	/* Check for overflow. */
 	if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) {
-		if (cfg->avg == 0) {
+		if (cfg->avg == 0 || cfg->avg > U32_MAX) {
 			pr_info("hashlimit invalid rate\n");
 			return -ERANGE;
 		}