From 61792b677415b77c8db04991c22966bb8de7603e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Oct 2018 16:47:16 +0200 Subject: [PATCH 01/14] netfilter: ipv6: fix oops when defragmenting locally generated fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike ipv4 and normal ipv6 defrag, netfilter ipv6 defragmentation did not save/restore skb->dst. This causes oops when handling locally generated ipv6 fragments, as output path needs a valid dst. Reported-by: Maciej Żenczykowski Fixes: 84379c9afe01 ("netfilter: ipv6: nf_defrag: drop skb dst before queueing") Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b8ac369f98ad..d219979c3e52 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -587,11 +587,16 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) */ ret = -EINPROGRESS; if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len && - nf_ct_frag6_reasm(fq, skb, dev)) - ret = 0; - else + fq->q.meat == fq->q.len) { + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + if (nf_ct_frag6_reasm(fq, skb, dev)) + ret = 0; + skb->_skb_refdst = orefdst; + } else { skb_dst_drop(skb); + } out_unlock: spin_unlock_bh(&fq->q.lock); From 5e91c9d9cd3fd557226ca75fed58816b9eee7e07 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Oct 2018 21:49:45 +0200 Subject: [PATCH 02/14] netfilter: nft_osf: check if attribute is present If the attribute is not sent, eg. old libnftnl binary, then tb[NFTA_OSF_TTL] is NULL and kernel crashes from the _init path. 
Fixes: a218dc82f0b5 ("netfilter: nft_osf: Add ttl option support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index ca5e5d8c5ef8..b13618c764ec 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -50,7 +50,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; - if (nla_get_u8(tb[NFTA_OSF_TTL])) { + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) return -EINVAL; From 5a8de47b3c250521dd632cdedaac6db88367defa Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 24 Oct 2018 13:54:03 +0200 Subject: [PATCH 03/14] netfilter: bridge: define INT_MIN & INT_MAX in userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With 4.19, programs like ebtables fail to build when they include "linux/netfilter_bridge.h". It is caused by commit 94276fa8a2a4 which added a use of INT_MIN and INT_MAX to the header: : In file included from /usr/include/linux/netfilter_bridge/ebtables.h:18, : from include/ebtables_u.h:28, : from communication.c:23: : /usr/include/linux/netfilter_bridge.h:30:20: error: 'INT_MIN' undeclared here (not in a function) : NF_BR_PRI_FIRST = INT_MIN, : ^~~~~~~ Define these constants by including "limits.h" when !__KERNEL__ (the same way as for other netfilter_* headers). 
Fixes: 94276fa8a2a4 ("netfilter: bridge: Expose nf_tables bridge hook priorities through uapi") Signed-off-by: Jiri Slaby Acked-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h index 156ccd089df1..1610fdbab98d 100644 --- a/include/uapi/linux/netfilter_bridge.h +++ b/include/uapi/linux/netfilter_bridge.h @@ -11,6 +11,10 @@ #include #include +#ifndef __KERNEL__ +#include <limits.h> /* for INT_MIN, INT_MAX */ +#endif + /* Bridge Hooks */ /* After promisc drops, checksum checks. */ #define NF_BR_PRE_ROUTING 0 From 4269fea768a11a447d8de620ce420f2214d4685c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Oct 2018 11:14:28 +0200 Subject: [PATCH 04/14] Revert "netfilter: nft_numgen: add map lookups for numgen random operations" Laura found a better way to do this from userspace without requiring kernel infrastructure, revert this. 
Fixes: 978d8f9055c3 ("netfilter: nft_numgen: add map lookups for numgen random operations") Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +- net/netfilter/nft_numgen.c | 127 ----------------------- 2 files changed, 2 insertions(+), 129 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 579974b0bf0d..7de4f1bdaf06 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1635,8 +1635,8 @@ enum nft_ng_attributes { NFTA_NG_MODULUS, NFTA_NG_TYPE, NFTA_NG_OFFSET, - NFTA_NG_SET_NAME, - NFTA_NG_SET_ID, + NFTA_NG_SET_NAME, /* deprecated */ + NFTA_NG_SET_ID, /* deprecated */ __NFTA_NG_MAX }; #define NFTA_NG_MAX (__NFTA_NG_MAX - 1) diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 649d1700ec5b..3cc1b3dc3c3c 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -24,7 +24,6 @@ struct nft_ng_inc { u32 modulus; atomic_t counter; u32 offset; - struct nft_set *map; }; static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) @@ -48,34 +47,11 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_inc_gen(priv); } -static void nft_ng_inc_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_inc_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - - if (!found) - return; - - nft_data_copy(&regs->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, [NFTA_NG_OFFSET] = { .type = NLA_U32 }, - [NFTA_NG_SET_NAME] = { 
.type = NLA_STRING, - .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_NG_SET_ID] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -101,22 +77,6 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_inc_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_inc_init(ctx, expr, tb); - - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { @@ -143,27 +103,10 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_inc_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_inc *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_INCREMENTAL, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; u32 offset; - struct nft_set *map; }; static u32 nft_ng_random_gen(struct nft_ng_random *priv) @@ -183,25 +126,6 @@ static void nft_ng_random_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_random_gen(priv); } -static void nft_ng_random_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_random_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - 
nft_data_copy(&regs->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static int nft_ng_random_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -226,21 +150,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_random_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); @@ -249,22 +158,6 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_random_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_random *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_RANDOM, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - static struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, @@ -274,14 +167,6 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .dump = nft_ng_inc_dump, }; -static const struct nft_expr_ops nft_ng_inc_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), - .eval = nft_ng_inc_map_eval, - .init = nft_ng_inc_map_init, - .dump = nft_ng_inc_map_dump, -}; - static const struct nft_expr_ops nft_ng_random_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), @@ -290,14 +175,6 @@ static const struct nft_expr_ops 
nft_ng_random_ops = { .dump = nft_ng_random_dump, }; -static const struct nft_expr_ops nft_ng_random_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), - .eval = nft_ng_random_map_eval, - .init = nft_ng_random_map_init, - .dump = nft_ng_random_map_dump, -}; - static const struct nft_expr_ops * nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { @@ -312,12 +189,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) switch (type) { case NFT_NG_INCREMENTAL: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_inc_map_ops; return &nft_ng_inc_ops; case NFT_NG_RANDOM: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_random_map_ops; return &nft_ng_random_ops; } From 439cd39ea136d2c026805264d58a91f36b6b64ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sat, 14 Jul 2018 21:59:43 +0200 Subject: [PATCH 05/14] netfilter: ipset: list:set: Decrease refcount synchronously on deletion and replace Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") postponed decreasing set reference counters to the RCU callback. An 'ipset del' command can terminate before the RCU grace period is elapsed, and if sets are listed before then, the reference counter shown in userspace will be wrong: # ipset create h hash:ip; ipset create l list:set; ipset add l # ipset del l h; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 1 Number of entries: 0 Members: # sleep 1; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 0 Number of entries: 0 Members: Fix this by making the reference count update synchronous again. As a result, when sets are listed, ip_set_name_byindex() might now fetch a set whose reference count is already zero. 
Instead of relying on the reference count to protect against concurrent set renaming, grab ip_set_ref_lock as reader and copy the name, while holding the same lock in ip_set_rename() as writer instead. Reported-by: Li Shuang Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") Signed-off-by: Stefano Brivio Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 2 +- net/netfilter/ipset/ip_set_core.c | 23 +++++++++++------------ net/netfilter/ipset/ip_set_list_set.c | 17 +++++++++++------ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 34fc80f3eb90..1d100efe74ec 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -314,7 +314,7 @@ enum { extern ip_set_id_t ip_set_get_byname(struct net *net, const char *name, struct ip_set **set); extern void ip_set_put_byindex(struct net *net, ip_set_id_t index); -extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index); +extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name); extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index); extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bc4bd247bb7d..fa15a831aeee 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -693,21 +693,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. - * We assume the set is referenced, so it does exist and - * can't be destroyed. The set cannot be renamed due to - * the referencing either. 
- * + * Set itself is protected by RCU, but its name isn't: to protect against + * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the + * name. */ -const char * -ip_set_name_byindex(struct net *net, ip_set_id_t index) +void +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { - const struct ip_set *set = ip_set_rcu_get(net, index); + struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); - BUG_ON(set->ref == 0); - /* Referenced, so it's safe */ - return set->name; + read_lock_bh(&ip_set_ref_lock); + strncpy(name, set->name, IPSET_MAXNAMELEN); + read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1153,7 +1152,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, if (!set) return -ENOENT; - read_lock_bh(&ip_set_ref_lock); + write_lock_bh(&ip_set_ref_lock); if (set->ref != 0) { ret = -IPSET_ERR_REFERENCED; goto out; @@ -1170,7 +1169,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, strncpy(set->name, name2, IPSET_MAXNAMELEN); out: - read_unlock_bh(&ip_set_ref_lock); + write_unlock_bh(&ip_set_ref_lock); return ret; } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 072a658fde04..4eef55da0878 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu) { struct set_elem *e = container_of(rcu, struct set_elem, rcu); struct ip_set *set = e->set; - struct list_set *map = set->data; - ip_set_put_byindex(map->net, e->id); ip_set_ext_destroy(set, e); kfree(e); } @@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu) static inline void list_set_del(struct ip_set *set, struct set_elem *e) { + struct list_set *map = set->data; + set->elements--; list_del_rcu(&e->list); + ip_set_put_byindex(map->net, e->id); call_rcu(&e->rcu, __list_set_del_rcu); } static inline void -list_set_replace(struct set_elem *e, struct set_elem 
*old) +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { + struct list_set *map = set->data; + list_replace_rcu(&old->list, &e->list); + ip_set_put_byindex(map->net, old->id); call_rcu(&old->rcu, __list_set_del_rcu); } @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) - list_set_replace(e, n); + list_set_replace(set, e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set, const struct list_set *map = set->data; struct nlattr *atd, *nested; u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + char name[IPSET_MAXNAMELEN]; struct set_elem *e; int ret = 0; @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set, nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; - if (nla_put_string(skb, IPSET_ATTR_NAME, - ip_set_name_byindex(map->net, e->id))) + ip_set_name_byindex(map->net, e->id, name); + if (nla_put_string(skb, IPSET_ATTR_NAME, name)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; From 886503f34d63e681662057448819edb5b1057a97 Mon Sep 17 00:00:00 2001 From: Eric Westbrook Date: Tue, 28 Aug 2018 15:14:42 -0600 Subject: [PATCH 06/14] netfilter: ipset: actually allow allowable CIDR 0 in hash:net,port,net Allow /0 as advertised for hash:net,port,net sets. For "hash:net,port,net", ipset(8) says that "either subnet is permitted to be a /0 should you wish to match port between all destinations." Make that statement true. 
Before: # ipset create cidrzero hash:net,port,net # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0 ipset v6.34: The value of the CIDR parameter of the IP address is invalid # ipset create cidrzero6 hash:net,port,net family inet6 # ipset add cidrzero6 ::/0,12345,::/0 ipset v6.34: The value of the CIDR parameter of the IP address is invalid After: # ipset create cidrzero hash:net,port,net # ipset add cidrzero 0.0.0.0/0,12345,0.0.0.0/0 # ipset test cidrzero 192.168.205.129,12345,172.16.205.129 192.168.205.129,tcp:12345,172.16.205.129 is in set cidrzero. # ipset create cidrzero6 hash:net,port,net family inet6 # ipset add cidrzero6 ::/0,12345,::/0 # ipset test cidrzero6 fe80::1,12345,ff00::1 fe80::1,tcp:12345,ff00::1 is in set cidrzero6. See also: https://bugzilla.kernel.org/show_bug.cgi?id=200897 https://github.com/ewestbrook/linux/commit/df7ff6efb0934ab6acc11f003ff1a7580d6c1d9c Signed-off-by: Eric Westbrook Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netportnet.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index d391485a6acd..613e18e720a4 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -213,13 +213,13 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } @@ -493,13 +493,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > 
HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } From ed956f3947a01ff9875cd908d7c1ef1fe7f47bf0 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 22 Oct 2018 23:30:40 +0200 Subject: [PATCH 07/14] netfilter: ipset: fix ip_set_list allocation failure ip_set_create() and ip_set_net_init() attempt to allocate physically contiguous memory for ip_set_list. If memory is fragmented, the allocations could easily fail: vzctl: page allocation failure: order:7, mode:0xc0d0 Call Trace: dump_stack+0x19/0x1b warn_alloc_failed+0x110/0x180 __alloc_pages_nodemask+0x7bf/0xc60 alloc_pages_current+0x98/0x110 kmalloc_order+0x18/0x40 kmalloc_order_trace+0x26/0xa0 __kmalloc+0x279/0x290 ip_set_net_init+0x4b/0x90 [ip_set] ops_init+0x3b/0xb0 setup_net+0xbb/0x170 copy_net_ns+0xf1/0x1c0 create_new_namespaces+0xf9/0x180 copy_namespaces+0x8e/0xd0 copy_process+0xb61/0x1a00 do_fork+0x91/0x320 Use kvcalloc() to fallback to 0-order allocations if high order page isn't available. 
Signed-off-by: Andrey Ryabinin Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index fa15a831aeee..68db946df151 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -960,7 +960,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Wraparound */ goto cleanup; - list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -972,7 +972,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Use new list */ index = inst->ip_set_max; inst->ip_set_max = i; - kfree(tmp); + kvfree(tmp); ret = 0; } else if (ret) { goto cleanup; @@ -2058,7 +2058,7 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; @@ -2086,7 +2086,7 @@ ip_set_net_exit(struct net *net) } } nfnl_unlock(NFNL_SUBSYS_IPSET); - kfree(rcu_dereference_protected(inst->ip_set_list, 1)); + kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { From 17b8b74c0f8dbf9b9e3301f9ca5b65dd1c079951 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 19 Oct 2018 19:35:19 +0200 Subject: [PATCH 08/14] netfilter: ipset: Correct rcu_dereference() call in ip_set_put_comment() The function is called when rcu_read_lock() is held and not when rcu_read_lock_bh() is held. 
Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set_comment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h index 8e2bab1e8e90..70877f8de7e9 100644 --- a/include/linux/netfilter/ipset/ip_set_comment.h +++ b/include/linux/netfilter/ipset/ip_set_comment.h @@ -43,11 +43,11 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, rcu_assign_pointer(comment->c, c); } -/* Used only when dumping a set, protected by rcu_read_lock_bh() */ +/* Used only when dumping a set, protected by rcu_read_lock() */ static inline int ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) { - struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c); + struct ip_set_comment_rcu *c = rcu_dereference(comment->c); if (!c) return 0; From 54451f60c8fa061af9051a53be9786393947367c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 21 Oct 2018 00:00:08 +0900 Subject: [PATCH 09/14] netfilter: xt_IDLETIMER: add sysfs filename checking routine When IDLETIMER rule is added, sysfs file is created under /sys/class/xt_idletimer/timers/ But some label name shouldn't be used. ".", "..", "power", "uevent", "subsystem", etc... So that sysfs filename checking routine is needed. test commands: %iptables -I INPUT -j IDLETIMER --timeout 1 --label "power" splat looks like: [95765.423132] sysfs: cannot create duplicate filename '/devices/virtual/xt_idletimer/timers/power' [95765.433418] CPU: 0 PID: 8446 Comm: iptables Not tainted 4.19.0-rc6+ #20 [95765.449755] Call Trace: [95765.449755] dump_stack+0xc9/0x16b [95765.449755] ? show_regs_print_info+0x5/0x5 [95765.449755] sysfs_warn_dup+0x74/0x90 [95765.449755] sysfs_add_file_mode_ns+0x352/0x500 [95765.449755] sysfs_create_file_ns+0x179/0x270 [95765.449755] ? sysfs_add_file_mode_ns+0x500/0x500 [95765.449755] ? 
idletimer_tg_checkentry+0x3e5/0xb1b [xt_IDLETIMER] [95765.449755] ? rcu_read_lock_sched_held+0x114/0x130 [95765.449755] ? __kmalloc_track_caller+0x211/0x2b0 [95765.449755] ? memcpy+0x34/0x50 [95765.449755] idletimer_tg_checkentry+0x4e2/0xb1b [xt_IDLETIMER] [ ... ] Fixes: 0902b469bd25 ("netfilter: xtables: idletimer target implementation") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_IDLETIMER.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index c6acfc2d9c84..eb4cbd244c3d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -114,6 +114,22 @@ static void idletimer_tg_expired(struct timer_list *t) schedule_work(&timer->work); } +static int idletimer_check_sysfs_name(const char *name, unsigned int size) +{ + int ret; + + ret = xt_check_proc_name(name, size); + if (ret < 0) + return ret; + + if (!strcmp(name, "power") || + !strcmp(name, "subsystem") || + !strcmp(name, "uevent")) + return -EINVAL; + + return 0; +} + static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; @@ -124,6 +140,10 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); + if (ret < 0) + goto out_free_timer; + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { From 8a02bdd50b2ecb6d62121d2958d3ea186cc88ce7 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 30 Oct 2018 22:43:42 +0100 Subject: [PATCH 10/14] netfilter: ipset: Fix calling ip_set() macro at dumping The ip_set() macro is called when either ip_set_ref_lock held only or no lock/nfnl mutex is held at dumping. Take this into account properly. Also, use Pablo's suggestion to use rcu_dereference_raw(), the ref_netlink protects the set. 
Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 68db946df151..1577f2f76060 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -55,11 +55,15 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); -/* When the nfnl mutex is held: */ +/* When the nfnl mutex or ip_set_ref_lock is held: */ #define ip_set_dereference(p) \ - rcu_dereference_protected(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&ip_set_ref_lock)) #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] +#define ip_set_ref_netlink(inst,id) \ + rcu_dereference_raw((inst)->ip_set_list)[id] /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. 
Adding/deleting types is @@ -1251,7 +1255,7 @@ ip_set_dump_done(struct netlink_callback *cb) struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; - struct ip_set *set = ip_set(inst, index); + struct ip_set *set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); @@ -1440,7 +1444,7 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - set = ip_set(inst, index); + set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); From a95a7774d51e13f9cf4b7285666829b68852f07a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Nov 2018 00:11:34 +0100 Subject: [PATCH 11/14] netfilter: conntrack: add nf_{tcp,udp,sctp,icmp,dccp,icmpv6,generic}_pernet() Expose these functions to access conntrack protocol tracker netns area, nfnetlink_cttimeout needs this. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 39 ++++++++++++++++++++ net/netfilter/nf_conntrack_proto_dccp.c | 13 ++----- net/netfilter/nf_conntrack_proto_generic.c | 11 ++---- net/netfilter/nf_conntrack_proto_icmp.c | 11 ++---- net/netfilter/nf_conntrack_proto_icmpv6.c | 11 ++---- net/netfilter/nf_conntrack_proto_sctp.c | 11 ++---- net/netfilter/nf_conntrack_proto_tcp.c | 15 +++----- net/netfilter/nf_conntrack_proto_udp.c | 11 ++---- 8 files changed, 63 insertions(+), 59 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index eed04af9b75e..ae7b86f587f2 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -153,4 +153,43 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const char *fmt, ...) 
{ } #endif /* CONFIG_SYSCTL */ +static inline struct nf_generic_net *nf_generic_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.generic; +} + +static inline struct nf_tcp_net *nf_tcp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.tcp; +} + +static inline struct nf_udp_net *nf_udp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.udp; +} + +static inline struct nf_icmp_net *nf_icmp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp; +} + +static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6; +} + +#ifdef CONFIG_NF_CT_PROTO_DCCP +static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.dccp; +} +#endif + +#ifdef CONFIG_NF_CT_PROTO_SCTP +static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.sctp; +} +#endif + #endif /*_NF_CONNTRACK_PROTOCOL_H*/ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 171e9e122e5f..023c1445bc39 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -384,11 +384,6 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }, }; -static inline struct nf_dccp_net *dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} - static noinline bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct dccp_hdr *dh) @@ -401,7 +396,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb, state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: - dn = dccp_pernet(net); + dn = nf_dccp_pernet(net); if (dn->dccp_loose == 0) { msg = "not picking up existing connection "; goto out_invalid; @@ -568,7 +563,7 @@ static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout; + 
timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; @@ -681,7 +676,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); unsigned int *timeouts = data; int i; @@ -814,7 +809,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, static int dccp_init_net(struct net *net) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); struct nf_proto_net *pn = &dn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e10e867e0b55..5da19d5fbc76 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -27,11 +27,6 @@ static bool nf_generic_should_process(u8 proto) } } -static inline struct nf_generic_net *generic_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.generic; -} - static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) @@ -58,7 +53,7 @@ static int generic_packet(struct nf_conn *ct, } if (!timeout) - timeout = &generic_pernet(nf_ct_net(ct))->timeout; + timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -72,7 +67,7 @@ static int generic_packet(struct nf_conn *ct, static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); unsigned int *timeout = data; if (!timeout) @@ -138,7 +133,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, static int generic_init_net(struct net *net) { - struct 
nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); struct nf_proto_net *pn = &gn->pn; gn->timeout = nf_ct_generic_timeout; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 3598520bd19b..de64d8a5fdfd 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -25,11 +25,6 @@ static const unsigned int nf_ct_icmp_timeout = 30*HZ; -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { @@ -103,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct, } if (!timeout) - timeout = &icmp_pernet(nf_ct_net(ct))->timeout; + timeout = &nf_icmp_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -275,7 +270,7 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { if (!timeout) @@ -337,7 +332,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int icmp_init_net(struct net *net) { - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 378618feed5d..a15eefb8e317 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -30,11 +30,6 @@ static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; -static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6; -} - static bool icmpv6_pkt_to_tuple(const struct sk_buff 
*skb, unsigned int dataoff, struct net *net, @@ -87,7 +82,7 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, static unsigned int *icmpv6_get_timeouts(struct net *net) { - return &icmpv6_pernet(net)->timeout; + return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ @@ -286,7 +281,7 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); @@ -348,7 +343,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, static int icmpv6_init_net(struct net *net) { - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmpv6_timeout; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 3d719d3eb9a3..d53e3e78f605 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -146,11 +146,6 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { } }; -static inline struct nf_sctp_net *sctp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.sctp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. 
*/ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -480,7 +475,7 @@ static int sctp_packet(struct nf_conn *ct, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = sctp_pernet(nf_ct_net(ct))->timeouts; + timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); @@ -599,7 +594,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; /* set default SCTP timeouts. */ @@ -736,7 +731,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int sctp_init_net(struct net *net) { - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1bcf9984d45e..4dcbd51a8e97 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -272,11 +272,6 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { } }; -static inline struct nf_tcp_net *tcp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.tcp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. 
*/ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -475,7 +470,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; @@ -767,7 +762,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); - const struct nf_tcp_net *tn = tcp_pernet(net); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; @@ -841,7 +836,7 @@ static int tcp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; @@ -1283,7 +1278,7 @@ static unsigned int tcp_nlattr_tuple_size(void) static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); unsigned int *timeouts = data; int i; @@ -1508,7 +1503,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int tcp_init_net(struct net *net) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; if (!pn->users) { diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index a7aa70370913..c879d8d78cfd 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -32,14 +32,9 @@ static const 
unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_REPLIED] = 180*HZ, }; -static inline struct nf_udp_net *udp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.udp; -} - static unsigned int *udp_get_timeouts(struct net *net) { - return udp_pernet(net)->timeouts; + return nf_udp_pernet(net)->timeouts; } static void udp_error_log(const struct sk_buff *skb, @@ -212,7 +207,7 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; @@ -292,7 +287,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int udp_init_net(struct net *net) { - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); struct nf_proto_net *pn = &un->pn; if (!pn->users) { From 8866df9264a34e675b4ee8a151db819b87cce2d3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Nov 2018 00:14:00 +0100 Subject: [PATCH 12/14] netfilter: nfnetlink_cttimeout: pass default timeout policy to obj_to_nlattr Otherwise, we hit a NULL pointer dereference since handlers always assume default timeout policy is passed. netlink: 24 bytes leftover after parsing attributes in process `syz-executor2'. 
kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 9575 Comm: syz-executor1 Not tainted 4.19.0+ #312 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:icmp_timeout_obj_to_nlattr+0x77/0x170 net/netfilter/nf_conntrack_proto_icmp.c:297 Fixes: c779e849608a ("netfilter: conntrack: remove get_timeout() indirection") Reported-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 47 ++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index e7a50af1b3d6..a518eb162344 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -382,7 +382,8 @@ err: static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 l3num, - const struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto, + const unsigned int *timeouts) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -408,7 +409,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); if (ret < 0) goto nla_put_failure; @@ -430,6 +431,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts = NULL; struct sk_buff *skb2; int ret, err; __u16 l3num; @@ -442,12 +444,44 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find_get(l4num); - /* This protocol is not supported, skip. 
*/ - if (l4proto->l4proto != l4num) { - err = -EOPNOTSUPP; + err = -EOPNOTSUPP; + if (l4proto->l4proto != l4num) goto err; + + switch (l4proto->l4proto) { + case IPPROTO_ICMP: + timeouts = &nf_icmp_pernet(net)->timeout; + break; + case IPPROTO_TCP: + timeouts = nf_tcp_pernet(net)->timeouts; + break; + case IPPROTO_UDP: + timeouts = nf_udp_pernet(net)->timeouts; + break; + case IPPROTO_DCCP: +#ifdef CONFIG_NF_CT_PROTO_DCCP + timeouts = nf_dccp_pernet(net)->dccp_timeout; +#endif + break; + case IPPROTO_ICMPV6: + timeouts = &nf_icmpv6_pernet(net)->timeout; + break; + case IPPROTO_SCTP: +#ifdef CONFIG_NF_CT_PROTO_SCTP + timeouts = nf_sctp_pernet(net)->timeouts; +#endif + break; + case 255: + timeouts = &nf_generic_pernet(net)->timeout; + break; + default: + WARN_ON_ONCE(1); + break; } + if (!timeouts) + goto err; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { err = -ENOMEM; @@ -458,8 +492,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, - l3num, - l4proto); + l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); err = -ENOMEM; From e4844c9c62a0fe47980d6c3d4b7a096a5d755925 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 2 Nov 2018 11:33:37 +0100 Subject: [PATCH 13/14] netfilter: nft_compat: ebtables 'nat' table is normal chain type Unlike ip(6)tables, the ebtables nat table has no special properties. This bug causes 'ebtables -A' to fail when using a target such as 'snat' (ebt_snat target sets ".table = "nat"'). Targets that have no table restrictions work fine. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 768292eac2a4..9d0ede474224 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -54,9 +54,11 @@ static bool nft_xt_put(struct nft_xt *xt) return false; } -static int nft_compat_chain_validate_dependency(const char *tablename, - const struct nft_chain *chain) +static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, + const char *tablename) { + enum nft_chain_types type = NFT_CHAIN_T_DEFAULT; + const struct nft_chain *chain = ctx->chain; const struct nft_base_chain *basechain; if (!tablename || @@ -64,9 +66,12 @@ static int nft_compat_chain_validate_dependency(const char *tablename, return 0; basechain = nft_base_chain(chain); - if (strcmp(tablename, "nat") == 0 && - basechain->type->type != NFT_CHAIN_T_NAT) - return -EINVAL; + if (strcmp(tablename, "nat") == 0) { + if (ctx->family != NFPROTO_BRIDGE) + type = NFT_CHAIN_T_NAT; + if (basechain->type->type != type) + return -EINVAL; + } return 0; } @@ -342,8 +347,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(target->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; } @@ -590,8 +594,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(match->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; } From f393808dc64149ccd0e5a8427505ba2974a59854 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Thu, 25 Oct 2018 12:15:43 -0700 Subject: [PATCH 14/14] netfilter: conntrack: fix 
calculation of next bucket number in early_drop If there's no entry to drop in bucket that corresponds to the hash, early_drop() should look for it in other buckets. But since it increments hash instead of bucket number, it actually looks in the same bucket 8 times: hsize is 16k by default (14 bits) and hash is 32-bit value, so reciprocal_scale(hash, hsize) returns the same value for hash..hash+7 in most cases. Fix it by increasing bucket number instead of hash and rename _hash to bucket to avoid future confusion. Fixes: 3e86638e9a0b ("netfilter: conntrack: consider ct netns in early_drop logic") Cc: # v4.7+ Signed-off-by: Vasily Khoruzhick Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ca1168d67fac..e92e749aff53 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1073,19 +1073,22 @@ static unsigned int early_drop_list(struct net *net, return drops; } -static noinline int early_drop(struct net *net, unsigned int _hash) +static noinline int early_drop(struct net *net, unsigned int hash) { - unsigned int i; + unsigned int i, bucket; for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; - unsigned int hash, hsize, drops; + unsigned int hsize, drops; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hsize); - hash = reciprocal_scale(_hash++, hsize); + if (!i) + bucket = reciprocal_scale(hash, hsize); + else + bucket = (bucket + 1) % hsize; - drops = early_drop_list(net, &ct_hash[hash]); + drops = early_drop_list(net, &ct_hash[bucket]); rcu_read_unlock(); if (drops) {