Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter/IPVS fixes for net

The following patchset contains Netfilter/IPVS fixes for your net tree.
They are:

1) Fix SCTP connection setup when IPVS module is loaded and any scheduler
   is registered, from Xin Long.

2) Don't create a SCTP connection from SCTP ABORT packets, also from
   Xin Long.

3) WARN_ON() and drop the packet, instead of hitting BUG_ON(), on races
   when calling nf_nat_setup_info(). This is a longstanding problem,
   specifically when br_netfilter with conntrack support is in place.
   Patch from Florian Westphal.

4) Avoid soft lockup splats via iptables-restore, also from Florian.

5) Revert NAT hashtable conversion to rhashtable: the semantics of rhlist
   are different from our simple NAT hashtable, and this has been causing
   problems in recent Linux kernel releases. From Florian.

6) Add per-bucket spinlock for NAT hashtable, so at least we restore
   one of the benefits we got from the previous rhashtable conversion.

7) Fix incorrect hashtable size in memory allocation in xt_hashlimit,
   from Zhizhou Tian.

8) Fix build/link problems with hashlimit and 32-bit arches, to address
   recent fallout from a new hashlimit mode, from Vishwanath Pai.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2017-09-08 11:35:55 -07:00
commit 1080746110
9 changed files with 86 additions and 89 deletions

View File

@ -17,7 +17,6 @@
#include <linux/bitops.h> #include <linux/bitops.h>
#include <linux/compiler.h> #include <linux/compiler.h>
#include <linux/atomic.h> #include <linux/atomic.h>
#include <linux/rhashtable.h>
#include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/nf_conntrack_dccp.h> #include <linux/netfilter/nf_conntrack_dccp.h>
@ -77,7 +76,7 @@ struct nf_conn {
possible_net_t ct_net; possible_net_t ct_net;
#if IS_ENABLED(CONFIG_NF_NAT) #if IS_ENABLED(CONFIG_NF_NAT)
struct rhlist_head nat_bysource; struct hlist_node nat_bysource;
#endif #endif
/* all members below initialized via memset */ /* all members below initialized via memset */
u8 __nfct_init_offset[0]; u8 __nfct_init_offset[0];

View File

@ -1,6 +1,5 @@
#ifndef _NF_NAT_H #ifndef _NF_NAT_H
#define _NF_NAT_H #define _NF_NAT_H
#include <linux/rhashtable.h>
#include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4.h>
#include <linux/netfilter/nf_nat.h> #include <linux/netfilter/nf_nat.h>
#include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_tuple.h>

View File

@ -629,6 +629,7 @@ static void get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt); ADD_COUNTER(counters[i], bcnt, pcnt);
++i; ++i;
cond_resched();
} }
} }
} }

View File

@ -776,6 +776,7 @@ get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt); ADD_COUNTER(counters[i], bcnt, pcnt);
++i; /* macro does multi eval of i */ ++i; /* macro does multi eval of i */
cond_resched();
} }
} }
} }

View File

@ -795,6 +795,7 @@ get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt); ADD_COUNTER(counters[i], bcnt, pcnt);
++i; ++i;
cond_resched();
} }
} }
} }

View File

@ -215,7 +215,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp)
if (skip == hook_entries) if (skip == hook_entries)
goto out_assign; goto out_assign;
if (WARN_ON(skip == 0)) if (skip == 0)
return NULL; return NULL;
hook_entries -= skip; hook_entries -= skip;

View File

@ -24,10 +24,14 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
if (sh) { if (sh) {
sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sch = skb_header_pointer(skb, iph->len + sizeof(_sctph),
sizeof(_schunkh), &_schunkh); sizeof(_schunkh), &_schunkh);
if (sch && (sch->type == SCTP_CID_INIT || if (sch) {
sysctl_sloppy_sctp(ipvs))) if (sch->type == SCTP_CID_ABORT ||
!(sysctl_sloppy_sctp(ipvs) ||
sch->type == SCTP_CID_INIT))
return 1;
ports = &sh->source; ports = &sh->source;
} }
}
} else { } else {
ports = skb_header_pointer( ports = skb_header_pointer(
skb, iph->len, sizeof(_ports), &_ports); skb, iph->len, sizeof(_ports), &_ports);

View File

@ -30,19 +30,17 @@
#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h> #include <linux/netfilter/nf_nat.h>
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
static DEFINE_MUTEX(nf_nat_proto_mutex); static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly; __read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly; __read_mostly;
struct nf_nat_conn_key { static struct hlist_head *nf_nat_bysource __read_mostly;
const struct net *net; static unsigned int nf_nat_htable_size __read_mostly;
const struct nf_conntrack_tuple *tuple; static unsigned int nf_nat_hash_rnd __read_mostly;
const struct nf_conntrack_zone *zone;
};
static struct rhltable nf_nat_bysource_table;
inline const struct nf_nat_l3proto * inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family) __nf_nat_l3proto_find(u8 family)
@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
EXPORT_SYMBOL(nf_xfrm_me_harder); EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */ #endif /* CONFIG_XFRM */
static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) /* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{ {
const struct nf_conntrack_tuple *t; unsigned int hash;
const struct nf_conn *ct = data;
get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
/* Original src, to ensure we map it consistently if poss. */ /* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
seed ^= net_hash_mix(nf_ct_net(ct)); return reciprocal_scale(hash, nf_nat_htable_size);
return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
t->dst.protonum ^ seed);
} }
/* Is this tuple already taken? (not by us) */ /* Is this tuple already taken? (not by us) */
@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct,
t->src.u.all == tuple->src.u.all); t->src.u.all == tuple->src.u.all);
} }
static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
const void *obj)
{
const struct nf_nat_conn_key *key = arg->key;
const struct nf_conn *ct = obj;
if (!same_src(ct, key->tuple) ||
!net_eq(nf_ct_net(ct), key->net) ||
!nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
return 1;
return 0;
}
static struct rhashtable_params nf_nat_bysource_params = {
.head_offset = offsetof(struct nf_conn, nat_bysource),
.obj_hashfn = nf_nat_bysource_hash,
.obj_cmpfn = nf_nat_bysource_cmp,
.nelem_hint = 256,
.min_size = 1024,
};
/* Only called for SRC manip */ /* Only called for SRC manip */
static int static int
find_appropriate_src(struct net *net, find_appropriate_src(struct net *net,
@ -216,18 +194,14 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result, struct nf_conntrack_tuple *result,
const struct nf_nat_range *range) const struct nf_nat_range *range)
{ {
unsigned int h = hash_by_src(net, tuple);
const struct nf_conn *ct; const struct nf_conn *ct;
struct nf_nat_conn_key key = {
.net = net,
.tuple = tuple,
.zone = zone
};
struct rhlist_head *hl, *h;
hl = rhltable_lookup(&nf_nat_bysource_table, &key, hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
nf_nat_bysource_params); if (same_src(ct, tuple) &&
net_eq(net, nf_ct_net(ct)) &&
rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result, nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple); &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst; result->dst = tuple->dst;
@ -235,7 +209,7 @@ find_appropriate_src(struct net *net,
if (in_range(l3proto, l4proto, result, range)) if (in_range(l3proto, l4proto, result, range))
return 1; return 1;
} }
}
return 0; return 0;
} }
@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range, const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype) enum nf_nat_manip_type maniptype)
{ {
struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conntrack_tuple curr_tuple, new_tuple;
/* Can't setup nat info for confirmed ct. */ /* Can't setup nat info for confirmed ct. */
@ -416,7 +391,9 @@ nf_nat_setup_info(struct nf_conn *ct,
WARN_ON(maniptype != NF_NAT_MANIP_SRC && WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
maniptype != NF_NAT_MANIP_DST); maniptype != NF_NAT_MANIP_DST);
BUG_ON(nf_nat_initialized(ct, maniptype));
if (WARN_ON(nf_nat_initialized(ct, maniptype)))
return NF_DROP;
/* What we've got will look like inverse of reply. Normally /* What we've got will look like inverse of reply. Normally
* this is what is in the conntrack, except for prior * this is what is in the conntrack, except for prior
@ -447,19 +424,16 @@ nf_nat_setup_info(struct nf_conn *ct,
} }
if (maniptype == NF_NAT_MANIP_SRC) { if (maniptype == NF_NAT_MANIP_SRC) {
struct nf_nat_conn_key key = { unsigned int srchash;
.net = nf_ct_net(ct), spinlock_t *lock;
.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
.zone = nf_ct_zone(ct),
};
int err;
err = rhltable_insert_key(&nf_nat_bysource_table, srchash = hash_by_src(net,
&key, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
&ct->nat_bysource, lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)];
nf_nat_bysource_params); spin_lock_bh(lock);
if (err) hlist_add_head_rcu(&ct->nat_bysource,
return NF_DROP; &nf_nat_bysource[srchash]);
spin_unlock_bh(lock);
} }
/* It's done. */ /* It's done. */
@ -553,6 +527,16 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
return i->status & IPS_NAT_MASK ? 1 : 0; return i->status & IPS_NAT_MASK ? 1 : 0;
} }
static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
unsigned int h;
h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
hlist_del_rcu(&ct->nat_bysource);
spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
}
static int nf_nat_proto_clean(struct nf_conn *ct, void *data) static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{ {
if (nf_nat_proto_remove(ct, data)) if (nf_nat_proto_remove(ct, data))
@ -568,8 +552,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table. * will delete entry from already-freed table.
*/ */
clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, __nf_nat_cleanup_conntrack(ct);
nf_nat_bysource_params);
/* don't delete conntrack. Although that would make things a lot /* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod. * simpler, we'd end up flushing all conntracks on nat rmmod.
@ -698,8 +681,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
static void nf_nat_cleanup_conntrack(struct nf_conn *ct) static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{ {
if (ct->status & IPS_SRC_NAT_DONE) if (ct->status & IPS_SRC_NAT_DONE)
rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, __nf_nat_cleanup_conntrack(ct);
nf_nat_bysource_params);
} }
static struct nf_ct_ext_type nat_extend __read_mostly = { static struct nf_ct_ext_type nat_extend __read_mostly = {
@ -821,19 +803,27 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
static int __init nf_nat_init(void) static int __init nf_nat_init(void)
{ {
int ret; int ret, i;
ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); /* Leave them the same for the moment. */
if (ret) nf_nat_htable_size = nf_conntrack_htable_size;
return ret; if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks))
nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks);
nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
if (!nf_nat_bysource)
return -ENOMEM;
ret = nf_ct_extend_register(&nat_extend); ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) { if (ret < 0) {
rhltable_destroy(&nf_nat_bysource_table); nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret; return ret;
} }
for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++)
spin_lock_init(&nf_nat_locks[i]);
nf_ct_helper_expectfn_register(&follow_master_nat); nf_ct_helper_expectfn_register(&follow_master_nat);
BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
@ -863,8 +853,8 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++) for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]); kfree(nf_nat_l4protos[i]);
synchronize_net();
rhltable_destroy(&nf_nat_bysource_table); nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
} }
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");

View File

@ -35,6 +35,7 @@
#include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/xt_hashlimit.h> #include <linux/netfilter/xt_hashlimit.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/kernel.h>
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
@ -279,7 +280,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
size = cfg->size; size = cfg->size;
} else { } else {
size = (totalram_pages << PAGE_SHIFT) / 16384 / size = (totalram_pages << PAGE_SHIFT) / 16384 /
sizeof(struct list_head); sizeof(struct hlist_head);
if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
size = 8192; size = 8192;
if (size < 16) if (size < 16)
@ -287,7 +288,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
} }
/* FIXME: don't use vmalloc() here or anywhere else -HW */ /* FIXME: don't use vmalloc() here or anywhere else -HW */
hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) +
sizeof(struct list_head) * size); sizeof(struct hlist_head) * size);
if (hinfo == NULL) if (hinfo == NULL)
return -ENOMEM; return -ENOMEM;
*out_hinfo = hinfo; *out_hinfo = hinfo;
@ -527,12 +528,12 @@ static u64 user2rate(u64 user)
} }
} }
static u64 user2rate_bytes(u64 user) static u64 user2rate_bytes(u32 user)
{ {
u64 r; u64 r;
r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL; r = user ? U32_MAX / user : U32_MAX;
r = (r - 1) << 4; r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
return r; return r;
} }
@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
dh->rateinfo.prev_window = 0; dh->rateinfo.prev_window = 0;
dh->rateinfo.current_rate = 0; dh->rateinfo.current_rate = 0;
if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg); dh->rateinfo.rate =
user2rate_bytes((u32)hinfo->cfg.avg);
if (hinfo->cfg.burst) if (hinfo->cfg.burst)
dh->rateinfo.burst = dh->rateinfo.burst =
hinfo->cfg.burst * dh->rateinfo.rate; hinfo->cfg.burst * dh->rateinfo.rate;
@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
/* Check for overflow. */ /* Check for overflow. */
if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) {
if (cfg->avg == 0) { if (cfg->avg == 0 || cfg->avg > U32_MAX) {
pr_info("hashlimit invalid rate\n"); pr_info("hashlimit invalid rate\n");
return -ERANGE; return -ERANGE;
} }