From a9382d9389a045ddc1e6f8e0595eb793f6933d68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Dec 2021 08:39:59 -0800 Subject: [PATCH 01/32] netfilter: nfnetlink: add netns refcount tracker to struct nfulnl_instance If compiled with CONFIG_NET_NS_REFCNT_TRACKER=y, using put_net_track() in nfulnl_instance_free_rcu() and get_net_track() in instance_create() might help us finding netns refcount imbalances. Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 691ef4cffdd9..7a3a91fc7ffa 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -66,6 +66,7 @@ struct nfulnl_instance { struct sk_buff *skb; /* pre-allocatd skb */ struct timer_list timer; struct net *net; + netns_tracker ns_tracker; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ u32 peer_portid; /* PORTID of the peer process */ @@ -140,7 +141,7 @@ static void nfulnl_instance_free_rcu(struct rcu_head *head) struct nfulnl_instance *inst = container_of(head, struct nfulnl_instance, rcu); - put_net(inst->net); + put_net_track(inst->net, &inst->ns_tracker); kfree(inst); module_put(THIS_MODULE); } @@ -187,7 +188,7 @@ instance_create(struct net *net, u_int16_t group_num, timer_setup(&inst->timer, nfulnl_timer, 0); - inst->net = get_net(net); + inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC); inst->peer_user_ns = user_ns; inst->peer_portid = portid; inst->group_num = group_num; From fc0d026a2fadd3a637677efa2e8bd3b70403f4aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Dec 2021 08:40:00 -0800 Subject: [PATCH 02/32] netfilter: nf_nat_masquerade: add netns refcount tracker to masq_dev_work If compiled with CONFIG_NET_NS_REFCNT_TRACKER=y, using put_net_track() in iterate_cleanup_work() and netns_tracker_alloc() in nf_nat_masq_schedule() might help us finding netns refcount imbalances. Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_masquerade.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index acd73f717a08..e32fac374608 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -12,6 +12,7 @@ struct masq_dev_work { struct work_struct work; struct net *net; + netns_tracker ns_tracker; union nf_inet_addr addr; int ifindex; int (*iter)(struct nf_conn *i, void *data); @@ -82,7 +83,7 @@ static void iterate_cleanup_work(struct work_struct *work) nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); - put_net(w->net); + put_net_track(w->net, &w->ns_tracker); kfree(w); atomic_dec(&masq_worker_count); module_put(THIS_MODULE); @@ -119,6 +120,7 @@ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, INIT_WORK(&w->work, iterate_cleanup_work); w->ifindex = ifindex; w->net = net; + netns_tracker_alloc(net, &w->ns_tracker, gfp_flags); w->iter = iter; if (addr) w->addr = *addr; From 0d1873a52289f7214e7e70d2d0552b617582269a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Dec 2021 00:06:44 +0100 Subject: [PATCH 03/32] netfilter: nf_tables: remove rcu read-size lock Chain stats are updated from the Netfilter hook path which already run under rcu read-size lock section. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index adc348056076..41c7509955e6 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -110,7 +110,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, base_chain = nft_base_chain(chain); - rcu_read_lock(); pstats = READ_ONCE(base_chain->stats); if (pstats) { local_bh_disable(); @@ -121,7 +120,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, u64_stats_update_end(&stats->syncp); local_bh_enable(); } - rcu_read_unlock(); } struct nft_jumpstack { From 8801d791b48732acb3741cad06170fce880ed148 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Dec 2021 00:09:49 +0100 Subject: [PATCH 04/32] netfilter: nft_payload: WARN_ON_ONCE instead of BUG BUG() is too harsh for unknown payload base, use WARN_ON_ONCE() instead. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index bd689938a2e0..f2e65df32a06 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -157,7 +157,8 @@ void nft_payload_eval(const struct nft_expr *expr, goto err; break; default: - BUG(); + WARN_ON_ONCE(1); + goto err; } offset += priv->offset; @@ -664,7 +665,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr, goto err; break; default: - BUG(); + WARN_ON_ONCE(1); + goto err; } csum_offset = offset + priv->csum_offset; From 4765473fefd4403b5eeca371637065b561522c50 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Dec 2021 00:10:12 +0100 Subject: [PATCH 05/32] netfilter: nf_tables: consolidate rule verdict trace call Add function to consolidate verdict tracing. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 39 ++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 41c7509955e6..d026890a9842 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -67,6 +67,36 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +static noinline void __nft_trace_verdict(struct nft_traceinfo *info, + const struct nft_chain *chain, + const struct nft_regs *regs) +{ + enum nft_trace_types type; + + switch (regs->verdict.code) { + case NFT_CONTINUE: + case NFT_RETURN: + type = NFT_TRACETYPE_RETURN; + break; + default: + type = NFT_TRACETYPE_RULE; + break; + } + + __nft_trace_packet(info, chain, type); +} + +static inline void nft_trace_verdict(struct nft_traceinfo *info, + const struct nft_chain *chain, + const struct nft_rule *rule, + const struct nft_regs *regs) +{ + if (static_branch_unlikely(&nft_trace_enabled)) { + info->rule = rule; + __nft_trace_verdict(info, chain, regs); + } +} + static bool nft_payload_fast_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -205,13 +235,13 @@ next_rule: break; } + nft_trace_verdict(&info, chain, rule, ®s); + switch (regs.verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_DROP: case NF_QUEUE: case NF_STOLEN: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RULE); return regs.verdict.code; } @@ -224,15 +254,10 @@ next_rule: stackptr++; fallthrough; case NFT_GOTO: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RULE); - chain = regs.verdict.chain; goto do_chain; case NFT_CONTINUE: case NFT_RETURN: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RETURN); break; default: WARN_ON(1); From 690d541739a36e23b0c81ab7c5a7d2a72dd22107 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Dec 2021 00:11:45 +0100 Subject: [PATCH 06/32] netfilter: nf_tables: replace WARN_ON by WARN_ON_ONCE for unknown verdicts Bug might trigger warning for each packet, call WARN_ON_ONCE instead. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index d026890a9842..d2ada666d889 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -260,7 +260,7 @@ next_rule: case NFT_RETURN: break; default: - WARN_ON(1); + WARN_ON_ONCE(1); } if (stackptr > 0) { From 023223dfbfb34fcc9b7dd41e21fbf9a5d5237989 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 17 Dec 2021 20:37:34 +0100 Subject: [PATCH 07/32] netfilter: nf_tables: make counter support built-in Make counter support built-in to allow for direct call in case of CONFIG_RETPOLINE. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 6 +++ net/netfilter/Kconfig | 6 --- net/netfilter/Makefile | 3 +- net/netfilter/nf_tables_core.c | 5 +++ net/netfilter/nft_counter.c | 58 +++++++------------------- 5 files changed, 27 insertions(+), 51 deletions(-) diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 0fa5a6d98a00..b6fb1fdff9b2 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -7,6 +7,7 @@ extern struct nft_expr_type nft_imm_type; extern struct nft_expr_type nft_cmp_type; +extern struct nft_expr_type nft_counter_type; extern struct nft_expr_type nft_lookup_type; extern struct nft_expr_type nft_bitwise_type; extern struct nft_expr_type nft_byteorder_type; @@ -21,6 +22,7 @@ extern struct nft_expr_type nft_last_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; #endif +extern struct nft_object_type nft_counter_obj_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); @@ -120,6 +122,8 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); +void nft_counter_init_seqcount(void); + struct nft_expr; struct nft_regs; struct nft_pktinfo; @@ -143,4 +147,6 @@ void nft_dynset_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); +void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 3646fc195e7d..ddc54b6d18ee 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -515,12 +515,6 @@ config NFT_FLOW_OFFLOAD This option adds the "flow_offload" expression that you can use to choose what flows are placed into the hardware. -config NFT_COUNTER - tristate "Netfilter nf_tables counter module" - help - This option adds the "counter" expression that you can use to - include packet and byte counters in a rule. - config NFT_CONNLIMIT tristate "Netfilter nf_tables connlimit module" depends on NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index aab20e575ecd..a135b1a46014 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -75,7 +75,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ - nft_chain_route.o nf_tables_offload.o \ + nft_counter.o nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o @@ -100,7 +100,6 @@ obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o -obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o obj-$(CONFIG_NFT_REDIR) += nft_redir.o diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index d2ada666d889..df5eda7c7554 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -169,6 +169,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr, X(e, nft_payload_eval); X(e, nft_cmp_eval); + X(e, nft_counter_eval); X(e, nft_meta_get_eval); X(e, nft_lookup_eval); X(e, nft_range_eval); @@ -292,18 +293,22 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_rt_type, &nft_exthdr_type, &nft_last_type, + &nft_counter_type, }; static struct nft_object_type *nft_basic_objects[] = { #ifdef CONFIG_NETWORK_SECMARK &nft_secmark_obj_type, #endif + &nft_counter_obj_type, }; int __init nf_tables_core_module_init(void) { int err, i, j = 0; + nft_counter_init_seqcount(); + for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) { err = nft_register_obj(nft_basic_objects[i]); if (err) diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 8edd3b3c173d..f179e8c3b0ca 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -13,6 +13,7 @@ #include #include #include +#include #include struct nft_counter { @@ -174,7 +175,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; -static struct nft_object_type nft_counter_obj_type; +struct nft_object_type nft_counter_obj_type; static const struct nft_object_ops nft_counter_obj_ops = { .type = &nft_counter_obj_type, .size = sizeof(struct nft_counter_percpu_priv), @@ -184,7 +185,7 @@ static const struct nft_object_ops nft_counter_obj_ops = { .dump = nft_counter_obj_dump, }; -static struct nft_object_type nft_counter_obj_type __read_mostly = { +struct nft_object_type nft_counter_obj_type __read_mostly = { .type = NFT_OBJECT_COUNTER, .ops = &nft_counter_obj_ops, .maxattr = NFTA_COUNTER_MAX, @@ -192,9 +193,8 @@ static struct nft_object_type nft_counter_obj_type __read_mostly = { .owner = THIS_MODULE, }; -static void nft_counter_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); @@ -275,7 +275,15 @@ static void nft_counter_offload_stats(struct nft_expr *expr, preempt_enable(); } -static struct nft_expr_type nft_counter_type; +void nft_counter_init_seqcount(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); +} + +struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)), @@ -289,7 +297,7 @@ static const struct nft_expr_ops nft_counter_ops = { .offload_stats = nft_counter_offload_stats, }; -static struct nft_expr_type nft_counter_type __read_mostly = { +struct nft_expr_type nft_counter_type __read_mostly = { .name = "counter", .ops = &nft_counter_ops, .policy = nft_counter_policy, @@ -297,39 +305,3 @@ static struct nft_expr_type nft_counter_type __read_mostly = { .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, }; - -static int __init nft_counter_module_init(void) -{ - int cpu, err; - - for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); - - err = nft_register_obj(&nft_counter_obj_type); - if (err < 0) - return err; - - err = nft_register_expr(&nft_counter_type); - if (err < 0) - goto err1; - - return 0; -err1: - nft_unregister_obj(&nft_counter_obj_type); - return err; -} - -static void __exit nft_counter_module_exit(void) -{ - nft_unregister_expr(&nft_counter_type); - nft_unregister_obj(&nft_counter_obj_type); -} - -module_init(nft_counter_module_init); -module_exit(nft_counter_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_EXPR("counter"); -MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER); -MODULE_DESCRIPTION("nftables counter rule support"); From 4a6fbdd801e882ee6ca5cdfdc3374f0ae263174c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Dec 2021 11:29:56 +0100 Subject: [PATCH 08/32] netfilter: conntrack: tag conntracks picked up in local out hook This allows to identify flows that originate from local machine in a followup patch. It would be possible to make this a ->status bit instead. For now I did not do that yet because I don't have a use-case for exposing this info to userspace. If one comes up the toggle can be replaced with a status bit. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index d24b0a34c8f0..871489df63c6 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -95,6 +95,7 @@ struct nf_conn { unsigned long status; u16 cpu; + u16 local_origin:1; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d7e313548066..bed0017cadb0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1747,6 +1747,9 @@ resolve_normal_ct(struct nf_conn *tmpl, return 0; if (IS_ERR(h)) return PTR_ERR(h); + + ct = nf_ct_tuplehash_to_ctrack(h); + ct->local_origin = state->hook == NF_INET_LOCAL_OUT; } ct = nf_ct_tuplehash_to_ctrack(h); From 878aed8db324bec64f3c3f956e64d5ae7375a5de Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Dec 2021 11:29:57 +0100 Subject: [PATCH 09/32] netfilter: nat: force port remap to prevent shadowing well-known ports If destination port is above 32k and source port below 16k assume this might cause 'port shadowing' where a 'new' inbound connection matches an existing one, e.g. inbound X:41234 -> Y:53 matches existing conntrack entry Z:53 -> X:4123, where Z got natted to X. In this case, new packet is natted to Z:53 which is likely unwanted. We avoid the rewrite for connections that originate from local host: port-shadowing is only possible with forwarded connections. Also adjust test case. v3: no need to call tuple_force_port_remap if already in random mode (Phil) Signed-off-by: Florian Westphal Acked-by: Phil Sutter Acked-by: Eric Garver Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 43 ++++++++++++++++++-- tools/testing/selftests/netfilter/nft_nat.sh | 5 ++- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ab9f6c75524d..3dd130487b5b 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -494,6 +494,38 @@ another_round: goto another_round; } +static bool tuple_force_port_remap(const struct nf_conntrack_tuple *tuple) +{ + u16 sp, dp; + + switch (tuple->dst.protonum) { + case IPPROTO_TCP: + sp = ntohs(tuple->src.u.tcp.port); + dp = ntohs(tuple->dst.u.tcp.port); + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + sp = ntohs(tuple->src.u.udp.port); + dp = ntohs(tuple->dst.u.udp.port); + break; + default: + return false; + } + + /* IANA: System port range: 1-1023, + * user port range: 1024-49151, + * private port range: 49152-65535. + * + * Linux default ephemeral port range is 32768-60999. + * + * Enforce port remapping if sport is significantly lower + * than dport to prevent NAT port shadowing, i.e. + * accidental match of 'new' inbound connection vs. + * existing outbound one. + */ + return sp < 16384 && dp >= 32768; +} + /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the @@ -507,11 +539,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { + bool random_port = range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL; const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); + if (maniptype == NF_NAT_MANIP_SRC && + !random_port && + !ct->local_origin) + random_port = tuple_force_port_remap(orig_tuple); + /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. @@ -520,8 +558,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, * So far, we don't do local source mappings, so multiple * manips not an issue. */ - if (maniptype == NF_NAT_MANIP_SRC && - !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { + if (maniptype == NF_NAT_MANIP_SRC && !random_port) { /* try the original tuple first */ if (in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { @@ -545,7 +582,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, */ /* Only bother mapping if it's not already in range and unique */ - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { + if (!random_port) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index d88867d2fed7..349a319a9e51 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -880,8 +880,9 @@ EOF return $ksft_skip fi - # test default behaviour. Packet from ns1 to ns0 is redirected to ns2. - test_port_shadow "default" "CLIENT" + # test default behaviour. Packet from ns1 to ns0 is not redirected + # due to automatic port translation. + test_port_shadow "default" "ROUTER" # test packet filter based mitigation: prevent forwarding of # packets claiming to come from the service port. From c42ba4290b2147aa033d17f22151494515655d77 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Dec 2021 15:10:55 +0100 Subject: [PATCH 10/32] netfilter: flowtable: remove ipv4/ipv6 modules Just place the structs and registration in the inet module. nf_flow_table_ipv6, nf_flow_table_ipv4 and nf_flow_table_inet share same module dependencies: nf_flow_table, nf_tables. before: text data bss dec hex filename 2278 1480 0 3758 eae nf_flow_table_inet.ko 1159 1352 0 2511 9cf nf_flow_table_ipv6.ko 1154 1352 0 2506 9ca nf_flow_table_ipv4.ko after: 2369 1672 0 4041 fc9 nf_flow_table_inet.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 8 ++---- net/ipv4/netfilter/Makefile | 3 -- net/ipv4/netfilter/nf_flow_table_ipv4.c | 37 ------------------------ net/ipv6/netfilter/Kconfig | 8 ++---- net/ipv6/netfilter/nf_flow_table_ipv6.c | 38 ------------------------- net/netfilter/nf_flow_table_inet.c | 26 +++++++++++++++++ 6 files changed, 30 insertions(+), 90 deletions(-) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 63cb953bd019..67087f95579f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -59,12 +59,8 @@ config NF_TABLES_ARP endif # NF_TABLES config NF_FLOW_TABLE_IPV4 - tristate "Netfilter flow table IPv4 module" - depends on NF_FLOW_TABLE - help - This option adds the flow table IPv4 support. - - To compile it as a module, choose M here. + tristate + select NF_FLOW_TABLE_INET config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index f38fb1368ddb..93bad1184251 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -24,9 +24,6 @@ obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o -# flow table support -obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o - # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index aba65fe90345..e69de29bb2d1 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include - -static struct nf_flowtable_type flowtable_ipv4 = { - .family = NFPROTO_IPV4, - .init = nf_flow_table_init, - .setup = nf_flow_table_offload_setup, - .action = nf_flow_rule_route_ipv4, - .free = nf_flow_table_free, - .hook = nf_flow_offload_ip_hook, - .owner = THIS_MODULE, -}; - -static int __init nf_flow_ipv4_module_init(void) -{ - nft_register_flowtable_type(&flowtable_ipv4); - - return 0; -} - -static void __exit nf_flow_ipv4_module_exit(void) -{ - nft_unregister_flowtable_type(&flowtable_ipv4); -} - -module_init(nf_flow_ipv4_module_init); -module_exit(nf_flow_ipv4_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_ALIAS_NF_FLOWTABLE(AF_INET); -MODULE_DESCRIPTION("Netfilter flow table support"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index f22233e44ee9..97d3d1b36dbc 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -48,12 +48,8 @@ endif # NF_TABLES_IPV6 endif # NF_TABLES config NF_FLOW_TABLE_IPV6 - tristate "Netfilter flow table IPv6 module" - depends on NF_FLOW_TABLE - help - This option adds the flow table IPv6 support. - - To compile it as a module, choose M here. + tristate + select NF_FLOW_TABLE_INET config NF_DUP_IPV6 tristate "Netfilter IPv6 packet duplication to alternate destination" diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index 667b8af2546a..e69de29bb2d1 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include - -static struct nf_flowtable_type flowtable_ipv6 = { - .family = NFPROTO_IPV6, - .init = nf_flow_table_init, - .setup = nf_flow_table_offload_setup, - .action = nf_flow_rule_route_ipv6, - .free = nf_flow_table_free, - .hook = nf_flow_offload_ipv6_hook, - .owner = THIS_MODULE, -}; - -static int __init nf_flow_ipv6_module_init(void) -{ - nft_register_flowtable_type(&flowtable_ipv6); - - return 0; -} - -static void __exit nf_flow_ipv6_module_exit(void) -{ - nft_unregister_flowtable_type(&flowtable_ipv6); -} - -module_init(nf_flow_ipv6_module_init); -module_exit(nf_flow_ipv6_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); -MODULE_DESCRIPTION("Netfilter flow table IPv6 module"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index bc4126d8ef65..5c57ade6bd05 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -54,8 +54,30 @@ static struct nf_flowtable_type flowtable_inet = { .owner = THIS_MODULE, }; +static struct nf_flowtable_type flowtable_ipv4 = { + .family = NFPROTO_IPV4, + .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv4, + .free = nf_flow_table_free, + .hook = nf_flow_offload_ip_hook, + .owner = THIS_MODULE, +}; + +static struct nf_flowtable_type flowtable_ipv6 = { + .family = NFPROTO_IPV6, + .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv6, + .free = nf_flow_table_free, + .hook = nf_flow_offload_ipv6_hook, + .owner = THIS_MODULE, +}; + static int __init nf_flow_inet_module_init(void) { + nft_register_flowtable_type(&flowtable_ipv4); + nft_register_flowtable_type(&flowtable_ipv6); nft_register_flowtable_type(&flowtable_inet); return 0; @@ -64,6 +86,8 @@ static int __init nf_flow_inet_module_init(void) static void __exit nf_flow_inet_module_exit(void) { nft_unregister_flowtable_type(&flowtable_inet); + nft_unregister_flowtable_type(&flowtable_ipv6); + nft_unregister_flowtable_type(&flowtable_ipv4); } module_init(nf_flow_inet_module_init); @@ -71,5 +95,7 @@ module_exit(nf_flow_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */ MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module"); From 2b71e2c7b56cabe0aef95b79af98da921b87bc30 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 21 Dec 2021 19:37:57 +0000 Subject: [PATCH 11/32] netfilter: nft_set_pipapo_avx2: remove redundant pointer lt The pointer lt is being assigned a value and then later updated but that value is never read. The pointer is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo_avx2.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 6f4116e72958..52e0d026d30a 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1048,11 +1048,9 @@ static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { - unsigned long *lt = f->lt, bsize = f->bsize; + unsigned long bsize = f->bsize; int i, ret = -1, b; - lt += offset * NFT_PIPAPO_LONGS_PER_M256; - if (first) memset(map, 0xff, bsize * sizeof(*map)); From 613a0c67d12f33dcbeec2836f5fe60d05b4c18c0 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sun, 26 Dec 2021 01:12:41 +0800 Subject: [PATCH 12/32] netfilter: conntrack: Use max() instead of doing it manually Fix following coccicheck warning: ./include/net/netfilter/nf_conntrack.h:282:16-17: WARNING opportunity for max(). Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 871489df63c6..a4a14f3a5e38 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -279,7 +279,7 @@ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; - return timeout > 0 ? timeout : 0; + return max(timeout, 0); } static inline bool nf_ct_is_expired(const struct nf_conn *ct) From 719774377622bc4025d2a74f551b5dc2158c6c30 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:22 +0100 Subject: [PATCH 13/32] netfilter: conntrack: convert to refcount_t api Convert nf_conn reference counting from atomic_t to refcount_t based api. refcount_t api provides more runtime sanity checks and will warn on certain constructs, e.g. refcount_inc() on a zero reference count, which usually indicates use-after-free. For this reason template allocation is changed to init the refcount to 1, the subsequenct add operations are removed. Likewise, init_conntrack() is changed to set the initial refcount to 1 instead refcount_inc(). This is safe because the new entry is not (yet) visible to other cpus. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 8 +++--- net/netfilter/nf_conntrack_core.c | 26 +++++++++---------- net/netfilter/nf_conntrack_expect.c | 4 +-- net/netfilter/nf_conntrack_netlink.c | 6 ++--- net/netfilter/nf_conntrack_standalone.c | 4 +-- net/netfilter/nf_flow_table_core.c | 2 +- net/netfilter/nf_synproxy_core.c | 1 - net/netfilter/nft_ct.c | 4 +-- net/netfilter/xt_CT.c | 3 +-- net/openvswitch/conntrack.c | 1 - net/sched/act_ct.c | 1 - 11 files changed, 27 insertions(+), 33 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 700ea077ce2d..a03f7a80b9ab 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -2,7 +2,7 @@ #ifndef _NF_CONNTRACK_COMMON_H #define _NF_CONNTRACK_COMMON_H -#include +#include #include struct ip_conntrack_stat { @@ -25,19 +25,19 @@ struct ip_conntrack_stat { #define NFCT_PTRMASK ~(NFCT_INFOMASK) struct nf_conntrack { - atomic_t use; + refcount_t use; }; void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) { - if (nfct && atomic_dec_and_test(&nfct->use)) + if (nfct && refcount_dec_and_test(&nfct->use)) nf_conntrack_destroy(nfct); } static inline void nf_conntrack_get(struct nf_conntrack *nfct) { if (nfct) - atomic_inc(&nfct->use); + refcount_inc(&nfct->use); } #endif /* _NF_CONNTRACK_COMMON_H */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bed0017cadb0..328d359fcabe 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -585,7 +585,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); nf_ct_zone_add(tmpl, zone); - atomic_set(&tmpl->ct_general.use, 0); + refcount_set(&tmpl->ct_general.use, 1); return tmpl; } @@ -618,7 +618,7 @@ destroy_conntrack(struct nf_conntrack *nfct) struct nf_conn *ct = (struct nf_conn *)nfct; pr_debug("destroy_conntrack(%p)\n", ct); - WARN_ON(atomic_read(&nfct->use) != 0); + WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -742,7 +742,7 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) + if (!refcount_inc_not_zero(&ct->ct_general.use)) return; if (nf_ct_should_gc(ct)) @@ -810,7 +810,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, * in, try to obtain a reference and re-check tuple */ ct = nf_ct_tuplehash_to_ctrack(h); - if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { if (likely(nf_ct_key_equal(h, tuple, zone, net))) goto found; @@ -907,7 +907,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) smp_wmb(); /* The caller holds a reference to this object */ - atomic_set(&ct->ct_general.use, 2); + refcount_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); @@ -958,7 +958,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) { struct nf_conn_tstamp *tstamp; - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); ct->status |= IPS_CONFIRMED; /* set conntrack timestamp, if enabled. */ @@ -1351,7 +1351,7 @@ static unsigned int early_drop_list(struct net *net, nf_ct_is_dying(tmp)) continue; - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; /* kill only if still in same netns -- might have moved due to @@ -1469,7 +1469,7 @@ static void gc_worker(struct work_struct *work) continue; /* need to take reference to avoid possible races */ - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; if (gc_worker_skip_ct(tmp)) { @@ -1569,7 +1569,7 @@ __nf_conntrack_alloc(struct net *net, /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ - atomic_set(&ct->ct_general.use, 0); + refcount_set(&ct->ct_general.use, 0); return ct; out: atomic_dec(&cnet->count); @@ -1594,7 +1594,7 @@ void nf_conntrack_free(struct nf_conn *ct) /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU */ - WARN_ON(atomic_read(&ct->ct_general.use) != 0); + WARN_ON(refcount_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); @@ -1686,8 +1686,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (!exp) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - /* Now it is inserted into the unconfirmed list, bump refcount */ - nf_conntrack_get(&ct->ct_general); + /* Now it is inserted into the unconfirmed list, set refcount to 1. */ + refcount_set(&ct->ct_general.use, 1); nf_ct_add_to_unconfirmed_list(ct); local_bh_enable(); @@ -2300,7 +2300,7 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), return NULL; found: - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); spin_unlock(lockp); local_bh_enable(); return ct; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 1e89b595ecd0..96948e98ec53 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -203,12 +203,12 @@ nf_ct_find_expectation(struct net *net, * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * - * The atomic_inc_not_zero() check tells: If that fails, we + * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || - !atomic_inc_not_zero(&exp->master->ct_general.use))) + !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4f53d9480ed5..13e2e0390c77 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -508,7 +508,7 @@ nla_put_failure: static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { - if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) + if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use)))) goto nla_put_failure; return 0; @@ -1200,7 +1200,7 @@ restart: ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { if (i < ARRAY_SIZE(nf_ct_evict) && - atomic_inc_not_zero(&ct->ct_general.use)) + refcount_inc_not_zero(&ct->ct_general.use)) nf_ct_evict[i++] = ct; continue; } @@ -1748,7 +1748,7 @@ restart: NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying, 0); if (res < 0) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) + if (!refcount_inc_not_zero(&ct->ct_general.use)) continue; cb->args[0] = cpu; cb->args[1] = (unsigned long)ct; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 80f675d884b2..3e1afd10a9b6 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -303,7 +303,7 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; WARN_ON(!ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; if (nf_ct_should_gc(ct)) { @@ -370,7 +370,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); - seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); + seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index ed37bb9b4e58..b90eca7a2f22 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -48,7 +48,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) + !refcount_inc_not_zero(&ct->ct_general.use))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 3d6d49420db8..2dfc5dae0656 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -349,7 +349,6 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 99b1de14ff7e..518d96c8c247 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -259,7 +259,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(atomic_read(&ct->ct_general.use) == 1)) { + if (likely(refcount_read(&ct->ct_general.use) == 1)) { nf_ct_zone_add(ct, &zone); } else { /* previous skb got queued to userspace */ @@ -270,7 +270,6 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, } } - atomic_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } #endif @@ -375,7 +374,6 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } - atomic_set(&tmp->ct_general.use, 1); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 0a913ce07425..267757b0392a 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -24,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) return XT_CONTINUE; if (ct) { - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } else { nf_ct_set(skb, ct, IP_CT_UNTRACKED); @@ -201,7 +201,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 1b5eae57bc90..121664e52271 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1716,7 +1716,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, goto err_free_ct; __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index ab1810f2e660..3fa904cf8f27 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1228,7 +1228,6 @@ static int tcf_ct_fill_params(struct net *net, return -ENOMEM; } __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); - nf_conntrack_get(&tmpl->ct_general); p->tmpl = tmpl; return 0; From 3fce16493dc1aa2c9af3d7e7bd360dfe203a3e6a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:23 +0100 Subject: [PATCH 14/32] netfilter: core: move ip_ct_attach indirection to struct nf_ct_hook ip_ct_attach predates struct nf_ct_hook, we can place it there and remove the exported symbol. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 +- net/netfilter/core.c | 19 ++++++++----------- net/netfilter/nf_conntrack_core.c | 4 +--- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 3fda1a508733..e0e3f3355ab1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -440,7 +440,6 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include -extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, @@ -463,6 +462,7 @@ struct nf_ct_hook { void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); + void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); }; extern struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 6dec9cd395f1..dc806dc9fe28 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -673,25 +673,22 @@ struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) -/* This does not belong here, but locally generated errors need it if connection - tracking in use: without this, connection may not be in hash table, and hence - manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) - __rcu __read_mostly; -EXPORT_SYMBOL(ip_ct_attach); - struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); +/* This does not belong here, but locally generated errors need it if connection + * tracking in use: without this, connection may not be in hash table, and hence + * manufactured ICMP or RST packets will not be associated with it. + */ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { - void (*attach)(struct sk_buff *, const struct sk_buff *); + const struct nf_ct_hook *ct_hook; if (skb->_nfct) { rcu_read_lock(); - attach = rcu_dereference(ip_ct_attach); - if (attach) - attach(new, skb); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ct_hook->attach(new, skb); rcu_read_unlock(); } } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 328d359fcabe..89f1e5f0090b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2455,7 +2455,6 @@ static int kill_all(struct nf_conn *i, void *data) void nf_conntrack_cleanup_start(void) { conntrack_gc_work.exiting = true; - RCU_INIT_POINTER(ip_ct_attach, NULL); } void nf_conntrack_cleanup_end(void) @@ -2774,12 +2773,11 @@ static struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = destroy_conntrack, .get_tuple_skb = nf_conntrack_get_tuple_skb, + .attach = nf_conntrack_attach, }; void nf_conntrack_init_end(void) { - /* For use by REJECT target */ - RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); } From 285c8a7a58158cb1805c97ff03875df2ba2ea1fe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:24 +0100 Subject: [PATCH 15/32] netfilter: make function op structures const No functional changes, these structures should be const. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 8 ++++---- net/netfilter/core.c | 10 +++++----- net/netfilter/nf_conntrack_core.c | 4 ++-- net/netfilter/nf_conntrack_netlink.c | 4 ++-- net/netfilter/nf_nat_core.c | 2 +- net/netfilter/nfnetlink_queue.c | 8 ++++---- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e0e3f3355ab1..15e71bfff726 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -381,13 +381,13 @@ struct nf_nat_hook { enum ip_conntrack_dir dir); }; -extern struct nf_nat_hook __rcu *nf_nat_hook; +extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) - struct nf_nat_hook *nat_hook; + const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); @@ -464,7 +464,7 @@ struct nf_ct_hook { const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); }; -extern struct nf_ct_hook __rcu *nf_ct_hook; +extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; @@ -479,7 +479,7 @@ struct nfnl_ct_hook { void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; -extern struct nfnl_ct_hook __rcu *nfnl_ct_hook; +extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; /** * nf_skb_duplicated - TEE target has sent a packet diff --git a/net/netfilter/core.c b/net/netfilter/core.c index dc806dc9fe28..354cb472f386 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -666,14 +666,14 @@ EXPORT_SYMBOL(nf_hook_slow_list); /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ -struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; +const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfnl_ct_hook); -struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; +const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) -struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; +const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); /* This does not belong here, but locally generated errors need it if connection @@ -696,7 +696,7 @@ EXPORT_SYMBOL(nf_ct_attach); void nf_conntrack_destroy(struct nf_conntrack *nfct) { - struct nf_ct_hook *ct_hook; + const struct nf_ct_hook *ct_hook; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); @@ -709,7 +709,7 @@ EXPORT_SYMBOL(nf_conntrack_destroy); bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { - struct nf_ct_hook *ct_hook; + const struct nf_ct_hook *ct_hook; bool ret = false; rcu_read_lock(); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 89f1e5f0090b..cd3d07e418b5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2085,9 +2085,9 @@ static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { + const struct nf_nat_hook *nat_hook; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - struct nf_nat_hook *nat_hook; unsigned int status; int dataoff; u16 l3num; @@ -2769,7 +2769,7 @@ err_cachep: return ret; } -static struct nf_ct_hook nf_conntrack_hook = { +static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = destroy_conntrack, .get_tuple_skb = nf_conntrack_get_tuple_skb, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 13e2e0390c77..01cc69ab0b4e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1819,7 +1819,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, const struct nlattr *attr) __must_hold(RCU) { - struct nf_nat_hook *nat_hook; + const struct nf_nat_hook *nat_hook; int err; nat_hook = rcu_dereference(nf_nat_hook); @@ -2921,7 +2921,7 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff); } -static struct nfnl_ct_hook ctnetlink_glue_hook = { +static const struct nfnl_ct_hook ctnetlink_glue_hook = { .build_size = ctnetlink_glue_build_size, .build = ctnetlink_glue_build, .parse = ctnetlink_glue_parse, diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 3dd130487b5b..2d06a66899b2 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1167,7 +1167,7 @@ static struct pernet_operations nat_net_ops = { .size = sizeof(struct nat_net), }; -static struct nf_nat_hook nat_hook = { +static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 08771f47d469..862d8a3a0911 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -225,7 +225,7 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { - struct nf_ct_hook *ct_hook; + const struct nf_ct_hook *ct_hook; int err; if (verdict == NF_ACCEPT || @@ -388,7 +388,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct net_device *outdev; struct nf_conn *ct = NULL; enum ip_conntrack_info ctinfo = 0; - struct nfnl_ct_hook *nfnl_ct; + const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; char *secdata = NULL; u32 seclen = 0; @@ -1103,7 +1103,7 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb, return 0; } -static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, +static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[], struct nf_queue_entry *entry, @@ -1170,11 +1170,11 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); + const struct nfnl_ct_hook *nfnl_ct; struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; struct nf_queue_entry *entry; - struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; unsigned int verdict; int err; From 6ae7989c9af0d98ab64196f4f4c6f6499454bd23 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:25 +0100 Subject: [PATCH 16/32] netfilter: conntrack: avoid useless indirection during conntrack destruction nf_ct_put() results in a usesless indirection: nf_ct_put -> nf_conntrack_put -> nf_conntrack_destroy -> rcu readlock + indirect call of ct_hooks->destroy(). There are two _put helpers: nf_ct_put and nf_conntrack_put. The latter is what should be used in code that MUST NOT cause a linker dependency on the conntrack module (e.g. calls from core network stack). Everyone else should call nf_ct_put() instead. A followup patch will convert a few nf_conntrack_put() calls to nf_ct_put(), in particular from modules that already have a conntrack dependency such as act_ct or even nf_conntrack itself. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 2 ++ include/net/netfilter/nf_conntrack.h | 8 ++++++-- net/netfilter/nf_conntrack_core.c | 12 ++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index a03f7a80b9ab..2770db2fa080 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -29,6 +29,8 @@ struct nf_conntrack { }; void nf_conntrack_destroy(struct nf_conntrack *nfct); + +/* like nf_ct_put, but without module dependency on nf_conntrack */ static inline void nf_conntrack_put(struct nf_conntrack *nfct) { if (nfct && refcount_dec_and_test(&nfct->use)) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a4a14f3a5e38..8731d5bcb47d 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -76,6 +76,8 @@ struct nf_conn { * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, + * except that the latter uses internal indirection and does not + * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt. */ struct nf_conntrack ct_general; @@ -170,11 +172,13 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) return (struct nf_conn *)(nfct & NFCT_PTRMASK); } +void nf_ct_destroy(struct nf_conntrack *nfct); + /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { - WARN_ON(!ct); - nf_conntrack_put(&ct->ct_general); + if (ct && refcount_dec_and_test(&ct->ct_general.use)) + nf_ct_destroy(&ct->ct_general); } /* Protocol module loading */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cd3d07e418b5..7a2063abae04 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -558,7 +558,7 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) -/* Released via destroy_conntrack() */ +/* Released via nf_ct_destroy() */ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags) @@ -612,12 +612,11 @@ static void destroy_gre_conntrack(struct nf_conn *ct) #endif } -static void -destroy_conntrack(struct nf_conntrack *nfct) +void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("destroy_conntrack(%p)\n", ct); + pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { @@ -643,9 +642,10 @@ destroy_conntrack(struct nf_conntrack *nfct) if (ct->master) nf_ct_put(ct->master); - pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); + pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } +EXPORT_SYMBOL(nf_ct_destroy); static void nf_ct_delete_from_lists(struct nf_conn *ct) { @@ -2771,7 +2771,7 @@ err_cachep: static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, - .destroy = destroy_conntrack, + .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, }; From 408bdcfce8dfd6902f75fbcd3b99d8b24b506597 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:26 +0100 Subject: [PATCH 17/32] net: prefer nf_ct_put instead of nf_conntrack_put Its the same as nf_conntrack_put(), but without the need for an indirect call. The downside is a module dependency on nf_conntrack, but all of these already depend on conntrack anyway. Cc: Paul Blakey Cc: dev@openvswitch.org Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 4 ++-- net/openvswitch/conntrack.c | 14 ++++++++++---- net/sched/act_ct.c | 6 +++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7a2063abae04..c74933a6039f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -989,7 +989,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, nf_ct_acct_merge(ct, ctinfo, loser_ct); nf_ct_add_to_dying_list(loser_ct); - nf_conntrack_put(&loser_ct->ct_general); + nf_ct_put(loser_ct); nf_ct_set(skb, ct, ctinfo); NF_CT_STAT_INC(net, clash_resolve); @@ -1921,7 +1921,7 @@ repeat: /* Invalid: inverse of the return code tells * the netfilter core what to do */ pr_debug("nf_conntrack_in: Can't track with proto module\n"); - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); skb->_nfct = 0; NF_CT_STAT_INC_ATOMIC(state->net, invalid); if (ret == -NF_DROP) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 121664e52271..a921c5c00a1b 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -574,7 +574,7 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); nf_ct_delete(ct, 0, 0); - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); } } @@ -723,7 +723,7 @@ static bool skb_nfct_cached(struct net *net, if (nf_ct_is_confirmed(ct)) nf_ct_delete(ct, 0, 0); - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); nf_ct_set(skb, NULL, 0); return false; } @@ -967,7 +967,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, /* Associate skb with specified zone. */ if (tmpl) { - nf_conntrack_put(skb_nfct(skb)); + ct = nf_ct_get(skb, &ctinfo); + nf_ct_put(ct); nf_conntrack_get(&tmpl->ct_general); nf_ct_set(skb, tmpl, IP_CT_NEW); } @@ -1328,7 +1329,12 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) { - nf_conntrack_put(skb_nfct(skb)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + + nf_ct_put(ct); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); ovs_ct_fill_key(skb, key, false); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 3fa904cf8f27..9db291b45878 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -598,7 +598,7 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, if (nf_ct_is_confirmed(ct)) nf_ct_kill(ct); - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return false; @@ -763,7 +763,7 @@ static void tcf_ct_params_free(struct rcu_head *head) tcf_ct_flow_table_put(params); if (params->tmpl) - nf_conntrack_put(¶ms->tmpl->ct_general); + nf_ct_put(params->tmpl); kfree(params); } @@ -967,7 +967,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, qdisc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } From 6316136ec6e3dd1c302f7e7289a9ee46ecc610ae Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 15:46:16 +0100 Subject: [PATCH 18/32] netfilter: egress: avoid a lockdep splat include/linux/netfilter_netdev.h:97 suspicious rcu_dereference_check() usage! 2 locks held by sd-resolve/1100: 0: ..(rcu_read_lock_bh){1:3}, at: ip_finish_output2 1: ..(rcu_read_lock_bh){1:3}, at: __dev_queue_xmit __dev_queue_xmit+0 .. The helper has two callers, one uses rcu_read_lock, the other rcu_read_lock_bh(). Annotate the dereference to reflect this. Fixes: 42df6e1d221dd ("netfilter: Introduce egress hook") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index b71b57a83bb4..b4dd96e4dc8d 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -94,7 +94,7 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, return skb; #endif - e = rcu_dereference(dev->nf_hooks_egress); + e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; From 37f319f37d9005693dff085bb72852eeebc803ef Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:13 +0100 Subject: [PATCH 19/32] netfilter: nft_connlimit: move stateful fields out of expression data In preparation for the rule blob representation. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_connlimit.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 7d0761fad37e..58dcafe8bf79 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,7 +14,7 @@ #include struct nft_connlimit { - struct nf_conncount_list list; + struct nf_conncount_list *list; u32 limit; bool invert; }; @@ -43,12 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) { + if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; return; } - count = priv->list.count; + count = priv->list->count; if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; @@ -76,7 +76,11 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - nf_conncount_list_init(&priv->list); + priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL); + if (!priv->list) + return -ENOMEM; + + nf_conncount_list_init(priv->list); priv->limit = limit; priv->invert = invert; @@ -87,7 +91,8 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx, struct nft_connlimit *priv) { nf_ct_netns_put(ctx->net, ctx->family); - nf_conncount_cache_free(&priv->list); + nf_conncount_cache_free(priv->list); + kfree(priv->list); } static int nft_connlimit_do_dump(struct sk_buff *skb, @@ -200,7 +205,11 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - nf_conncount_list_init(&priv_dst->list); + priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC); + if (priv_dst->list) + return -ENOMEM; + + nf_conncount_list_init(priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = priv_src->invert; @@ -212,7 +221,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, { struct nft_connlimit *priv = nft_expr_priv(expr); - nf_conncount_cache_free(&priv->list); + nf_conncount_cache_free(priv->list); + kfree(priv->list); } static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) @@ -221,7 +231,7 @@ static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) bool ret; local_bh_disable(); - ret = nf_conncount_gc_list(net, &priv->list); + ret = nf_conncount_gc_list(net, priv->list); local_bh_enable(); return ret; From 33a24de37e814572491bcb35f42c0de74ad67586 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:14 +0100 Subject: [PATCH 20/32] netfilter: nft_last: move stateful fields out of expression data In preparation for the rule blob representation. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_last.c | 69 +++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 304e33cbed9b..5ee33d0ccd4e 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -8,9 +8,13 @@ #include #include +struct nft_last { + unsigned long jiffies; + unsigned int set; +}; + struct nft_last_priv { - unsigned long last_jiffies; - unsigned int last_set; + struct nft_last *last; }; static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = { @@ -22,47 +26,55 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_last_priv *priv = nft_expr_priv(expr); + struct nft_last *last; u64 last_jiffies; - u32 last_set = 0; int err; - if (tb[NFTA_LAST_SET]) { - last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET])); - if (last_set == 1) - priv->last_set = 1; - } + last = kzalloc(sizeof(*last), GFP_KERNEL); + if (!last) + return -ENOMEM; - if (last_set && tb[NFTA_LAST_MSECS]) { + if (tb[NFTA_LAST_SET]) + last->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET])); + + if (last->set && tb[NFTA_LAST_MSECS]) { err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies); if (err < 0) - return err; + goto err; - priv->last_jiffies = jiffies - (unsigned long)last_jiffies; + last->jiffies = jiffies - (unsigned long)last_jiffies; } + priv->last = last; return 0; +err: + kfree(last); + + return err; } static void nft_last_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_last_priv *priv = nft_expr_priv(expr); + struct nft_last *last = priv->last; - if (READ_ONCE(priv->last_jiffies) != jiffies) - WRITE_ONCE(priv->last_jiffies, jiffies); - if (READ_ONCE(priv->last_set) == 0) - WRITE_ONCE(priv->last_set, 1); + if (READ_ONCE(last->jiffies) != jiffies) + WRITE_ONCE(last->jiffies, jiffies); + if (READ_ONCE(last->set) == 0) + WRITE_ONCE(last->set, 1); } static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_last_priv *priv = nft_expr_priv(expr); - unsigned long last_jiffies = READ_ONCE(priv->last_jiffies); - u32 last_set = READ_ONCE(priv->last_set); + struct nft_last *last = priv->last; + unsigned long last_jiffies = READ_ONCE(last->jiffies); + u32 last_set = READ_ONCE(last->set); __be64 msecs; if (time_before(jiffies, last_jiffies)) { - WRITE_ONCE(priv->last_set, 0); + WRITE_ONCE(last->set, 0); last_set = 0; } @@ -81,11 +93,32 @@ nla_put_failure: return -1; } +static void nft_last_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + + kfree(priv->last); +} + +static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_last_priv *priv_dst = nft_expr_priv(dst); + + priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC); + if (priv_dst->last) + return -ENOMEM; + + return 0; +} + static const struct nft_expr_ops nft_last_ops = { .type = &nft_last_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)), .eval = nft_last_eval, .init = nft_last_init, + .destroy = nft_last_destroy, + .clone = nft_last_clone, .dump = nft_last_dump, }; From ed0a0c60f0e50fa52853620672af97edde3d3a03 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:15 +0100 Subject: [PATCH 21/32] netfilter: nft_quota: move stateful fields out of expression data In preparation for the rule blob representation. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_quota.c | 52 +++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index c4d1389f7185..0484aef74273 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -15,13 +15,13 @@ struct nft_quota { atomic64_t quota; unsigned long flags; - atomic64_t consumed; + atomic64_t *consumed; }; static inline bool nft_overquota(struct nft_quota *priv, const struct sk_buff *skb) { - return atomic64_add_return(skb->len, &priv->consumed) >= + return atomic64_add_return(skb->len, priv->consumed) >= atomic64_read(&priv->quota); } @@ -90,13 +90,23 @@ static int nft_quota_do_init(const struct nlattr * const tb[], return -EOPNOTSUPP; } + priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL); + if (!priv->consumed) + return -ENOMEM; + atomic64_set(&priv->quota, quota); priv->flags = flags; - atomic64_set(&priv->consumed, consumed); + atomic64_set(priv->consumed, consumed); return 0; } +static void nft_quota_do_destroy(const struct nft_ctx *ctx, + struct nft_quota *priv) +{ + kfree(priv->consumed); +} + static int nft_quota_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) @@ -128,7 +138,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, * that we see, don't go over the quota boundary in what we send to * userspace. */ - consumed = atomic64_read(&priv->consumed); + consumed = atomic64_read(priv->consumed); quota = atomic64_read(&priv->quota); if (consumed >= quota) { consumed_cap = quota; @@ -145,7 +155,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, goto nla_put_failure; if (reset) { - atomic64_sub(consumed, &priv->consumed); + atomic64_sub(consumed, priv->consumed); clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); } return 0; @@ -162,11 +172,20 @@ static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj, return nft_quota_do_dump(skb, priv, reset); } +static void nft_quota_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_quota *priv = nft_obj_data(obj); + + return nft_quota_do_destroy(ctx, priv); +} + static struct nft_object_type nft_quota_obj_type; static const struct nft_object_ops nft_quota_obj_ops = { .type = &nft_quota_obj_type, .size = sizeof(struct nft_quota), .init = nft_quota_obj_init, + .destroy = nft_quota_obj_destroy, .eval = nft_quota_obj_eval, .dump = nft_quota_obj_dump, .update = nft_quota_obj_update, @@ -205,12 +224,35 @@ static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_quota_do_dump(skb, priv, false); } +static void nft_quota_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + return nft_quota_do_destroy(ctx, priv); +} + +static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_quota *priv_dst = nft_expr_priv(dst); + + priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC); + if (priv_dst->consumed) + return -ENOMEM; + + atomic64_set(priv_dst->consumed, 0); + + return 0; +} + static struct nft_expr_type nft_quota_type; static const struct nft_expr_ops nft_quota_ops = { .type = &nft_quota_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_quota)), .eval = nft_quota_eval, .init = nft_quota_init, + .destroy = nft_quota_destroy, + .clone = nft_quota_clone, .dump = nft_quota_dump, }; From 567882eb3d441fef2aa42a75a9688a31979d29f5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:16 +0100 Subject: [PATCH 22/32] netfilter: nft_numgen: move stateful fields out of expression data In preparation for the rule blob representation. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_numgen.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 722cac1e90e0..1d378efd8823 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -18,7 +18,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); struct nft_ng_inc { u8 dreg; u32 modulus; - atomic_t counter; + atomic_t *counter; u32 offset; }; @@ -27,9 +27,9 @@ static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) u32 nval, oval; do { - oval = atomic_read(&priv->counter); + oval = atomic_read(priv->counter); nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; - } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); + } while (atomic_cmpxchg(priv->counter, oval, nval) != oval); return nval + priv->offset; } @@ -55,6 +55,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_ng_inc *priv = nft_expr_priv(expr); + int err; if (tb[NFTA_NG_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); @@ -66,10 +67,22 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - atomic_set(&priv->counter, priv->modulus - 1); + priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL); + if (!priv->counter) + return -ENOMEM; - return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, - NULL, NFT_DATA_VALUE, sizeof(u32)); + atomic_set(priv->counter, priv->modulus - 1); + + err = nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); + if (err < 0) + goto err; + + return 0; +err: + kfree(priv->counter); + + return err; } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, @@ -98,6 +111,14 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } +static void nft_ng_inc_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_ng_inc *priv = nft_expr_priv(expr); + + kfree(priv->counter); +} + struct nft_ng_random { u8 dreg; u32 modulus; @@ -157,6 +178,7 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), .eval = nft_ng_inc_eval, .init = nft_ng_inc_init, + .destroy = nft_ng_inc_destroy, .dump = nft_ng_inc_dump, }; From 369b6cb5d391750fc01ce951c2500281d2975705 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:17 +0100 Subject: [PATCH 23/32] netfilter: nft_limit: rename stateful structure From struct nft_limit to nft_limit_priv. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 104 +++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 82ec27bdf941..d6e0226b7603 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -14,7 +14,7 @@ #include #include -struct nft_limit { +struct nft_limit_priv { spinlock_t lock; u64 last; u64 tokens; @@ -25,33 +25,33 @@ struct nft_limit { bool invert; }; -static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) +static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) { u64 now, tokens; s64 delta; - spin_lock_bh(&limit->lock); + spin_lock_bh(&priv->lock); now = ktime_get_ns(); - tokens = limit->tokens + now - limit->last; - if (tokens > limit->tokens_max) - tokens = limit->tokens_max; + tokens = priv->tokens + now - priv->last; + if (tokens > priv->tokens_max) + tokens = priv->tokens_max; - limit->last = now; + priv->last = now; delta = tokens - cost; if (delta >= 0) { - limit->tokens = delta; - spin_unlock_bh(&limit->lock); - return limit->invert; + priv->tokens = delta; + spin_unlock_bh(&priv->lock); + return priv->invert; } - limit->tokens = tokens; - spin_unlock_bh(&limit->lock); - return !limit->invert; + priv->tokens = tokens; + spin_unlock_bh(&priv->lock); + return !priv->invert; } /* Use same default as in iptables. */ #define NFT_LIMIT_PKT_BURST_DEFAULT 5 -static int nft_limit_init(struct nft_limit *limit, +static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { u64 unit, tokens; @@ -60,58 +60,58 @@ static int nft_limit_init(struct nft_limit *limit, tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; - limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - limit->nsecs = unit * NSEC_PER_SEC; - if (limit->rate == 0 || limit->nsecs < unit) + priv->nsecs = unit * NSEC_PER_SEC; + if (priv->rate == 0 || priv->nsecs < unit) return -EOVERFLOW; if (tb[NFTA_LIMIT_BURST]) - limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); - if (pkts && limit->burst == 0) - limit->burst = NFT_LIMIT_PKT_BURST_DEFAULT; + if (pkts && priv->burst == 0) + priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; - if (limit->rate + limit->burst < limit->rate) + if (priv->rate + priv->burst < priv->rate) return -EOVERFLOW; if (pkts) { - tokens = div64_u64(limit->nsecs, limit->rate) * limit->burst; + tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst; } else { /* The token bucket size limits the number of tokens can be * accumulated. tokens_max specifies the bucket size. * tokens_max = unit * (rate + burst) / rate. */ - tokens = div64_u64(limit->nsecs * (limit->rate + limit->burst), - limit->rate); + tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst), + priv->rate); } - limit->tokens = tokens; - limit->tokens_max = limit->tokens; + priv->tokens = tokens; + priv->tokens_max = priv->tokens; if (tb[NFTA_LIMIT_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); if (flags & NFT_LIMIT_F_INV) - limit->invert = true; + priv->invert = true; } - limit->last = ktime_get_ns(); - spin_lock_init(&limit->lock); + priv->last = ktime_get_ns(); + spin_lock_init(&priv->lock); return 0; } -static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, +static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv, enum nft_limit_type type) { - u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0; - u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); + u32 flags = priv->invert ? NFT_LIMIT_F_INV : 0; + u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC); - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate), + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate), NFTA_LIMIT_PAD) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), NFTA_LIMIT_PAD) || - nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || + nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) || nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) goto nla_put_failure; @@ -121,8 +121,8 @@ nla_put_failure: return -1; } -struct nft_limit_pkts { - struct nft_limit limit; +struct nft_limit_priv_pkts { + struct nft_limit_priv limit; u64 cost; }; @@ -130,7 +130,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit_pkts *priv = nft_expr_priv(expr); + struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; @@ -148,7 +148,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_limit_pkts *priv = nft_expr_priv(expr); + struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); int err; err = nft_limit_init(&priv->limit, tb, true); @@ -161,7 +161,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) { - const struct nft_limit_pkts *priv = nft_expr_priv(expr); + const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } @@ -169,7 +169,7 @@ static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) static struct nft_expr_type nft_limit_type; static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .eval = nft_limit_pkts_eval, .init = nft_limit_pkts_init, .dump = nft_limit_pkts_dump, @@ -179,7 +179,7 @@ static void nft_limit_bytes_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit *priv = nft_expr_priv(expr); + struct nft_limit_priv *priv = nft_expr_priv(expr); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if (nft_limit_eval(priv, cost)) @@ -190,7 +190,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_limit *priv = nft_expr_priv(expr); + struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_init(priv, tb, false); } @@ -198,14 +198,14 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx, static int nft_limit_bytes_dump(struct sk_buff *skb, const struct nft_expr *expr) { - const struct nft_limit *priv = nft_expr_priv(expr); + const struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } static const struct nft_expr_ops nft_limit_bytes_ops = { .type = &nft_limit_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)), .eval = nft_limit_bytes_eval, .init = nft_limit_bytes_init, .dump = nft_limit_bytes_dump, @@ -240,7 +240,7 @@ static void nft_limit_obj_pkts_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit_pkts *priv = nft_obj_data(obj); + struct nft_limit_priv_pkts *priv = nft_obj_data(obj); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; @@ -250,7 +250,7 @@ static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { - struct nft_limit_pkts *priv = nft_obj_data(obj); + struct nft_limit_priv_pkts *priv = nft_obj_data(obj); int err; err = nft_limit_init(&priv->limit, tb, true); @@ -265,7 +265,7 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { - const struct nft_limit_pkts *priv = nft_obj_data(obj); + const struct nft_limit_priv_pkts *priv = nft_obj_data(obj); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } @@ -273,7 +273,7 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb, static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_pkts_ops = { .type = &nft_limit_obj_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .init = nft_limit_obj_pkts_init, .eval = nft_limit_obj_pkts_eval, .dump = nft_limit_obj_pkts_dump, @@ -283,7 +283,7 @@ static void nft_limit_obj_bytes_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit *priv = nft_obj_data(obj); + struct nft_limit_priv *priv = nft_obj_data(obj); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if (nft_limit_eval(priv, cost)) @@ -294,7 +294,7 @@ static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { - struct nft_limit *priv = nft_obj_data(obj); + struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_init(priv, tb, false); } @@ -303,7 +303,7 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { - const struct nft_limit *priv = nft_obj_data(obj); + const struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } @@ -311,7 +311,7 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb, static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_bytes_ops = { .type = &nft_limit_obj_type, - .size = sizeof(struct nft_limit), + .size = sizeof(struct nft_limit_priv), .init = nft_limit_obj_bytes_init, .eval = nft_limit_obj_bytes_eval, .dump = nft_limit_obj_bytes_dump, From 3b9e2ea6c11bff72ac1d607f6b954e7666b47409 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:18 +0100 Subject: [PATCH 24/32] netfilter: nft_limit: move stateful fields out of expression data In preparation for the rule blob representation. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 94 ++++++++++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index d6e0226b7603..f04be5be73a0 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -14,10 +14,14 @@ #include #include -struct nft_limit_priv { +struct nft_limit { spinlock_t lock; u64 last; u64 tokens; +}; + +struct nft_limit_priv { + struct nft_limit *limit; u64 tokens_max; u64 rate; u64 nsecs; @@ -30,21 +34,21 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) u64 now, tokens; s64 delta; - spin_lock_bh(&priv->lock); + spin_lock_bh(&priv->limit->lock); now = ktime_get_ns(); - tokens = priv->tokens + now - priv->last; + tokens = priv->limit->tokens + now - priv->limit->last; if (tokens > priv->tokens_max) tokens = priv->tokens_max; - priv->last = now; + priv->limit->last = now; delta = tokens - cost; if (delta >= 0) { - priv->tokens = delta; - spin_unlock_bh(&priv->lock); + priv->limit->tokens = delta; + spin_unlock_bh(&priv->limit->lock); return priv->invert; } - priv->tokens = tokens; - spin_unlock_bh(&priv->lock); + priv->limit->tokens = tokens; + spin_unlock_bh(&priv->limit->lock); return !priv->invert; } @@ -86,8 +90,12 @@ static int nft_limit_init(struct nft_limit_priv *priv, priv->rate); } - priv->tokens = tokens; - priv->tokens_max = priv->tokens; + priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL); + if (!priv->limit) + return -ENOMEM; + + priv->limit->tokens = tokens; + priv->tokens_max = priv->limit->tokens; if (tb[NFTA_LIMIT_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); @@ -95,8 +103,8 @@ static int nft_limit_init(struct nft_limit_priv *priv, if (flags & NFT_LIMIT_F_INV) priv->invert = true; } - priv->last = ktime_get_ns(); - spin_lock_init(&priv->lock); + priv->limit->last = ktime_get_ns(); + spin_lock_init(&priv->limit->lock); return 0; } @@ -121,6 +129,32 @@ nla_put_failure: return -1; } +static void nft_limit_destroy(const struct nft_ctx *ctx, + const struct nft_limit_priv *priv) +{ + kfree(priv->limit); +} + +static int nft_limit_clone(struct nft_limit_priv *priv_dst, + const struct nft_limit_priv *priv_src) +{ + priv_dst->tokens_max = priv_src->tokens_max; + priv_dst->rate = priv_src->rate; + priv_dst->nsecs = priv_src->nsecs; + priv_dst->burst = priv_src->burst; + priv_dst->invert = priv_src->invert; + + priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC); + if (priv_dst->limit) + return -ENOMEM; + + spin_lock_init(&priv_dst->limit->lock); + priv_dst->limit->tokens = priv_src->tokens_max; + priv_dst->limit->last = ktime_get_ns(); + + return 0; +} + struct nft_limit_priv_pkts { struct nft_limit_priv limit; u64 cost; @@ -166,12 +200,30 @@ static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } +static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); + + nft_limit_destroy(ctx, &priv->limit); +} + +static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); + struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); + + return nft_limit_clone(&priv_dst->limit, &priv_src->limit); +} + static struct nft_expr_type nft_limit_type; static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .eval = nft_limit_pkts_eval, .init = nft_limit_pkts_init, + .destroy = nft_limit_pkts_destroy, + .clone = nft_limit_pkts_clone, .dump = nft_limit_pkts_dump, }; @@ -203,12 +255,30 @@ static int nft_limit_bytes_dump(struct sk_buff *skb, return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } +static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_limit_priv *priv = nft_expr_priv(expr); + + nft_limit_destroy(ctx, priv); +} + +static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_limit_priv *priv_dst = nft_expr_priv(dst); + struct nft_limit_priv *priv_src = nft_expr_priv(src); + + return nft_limit_clone(priv_dst, priv_src); +} + static const struct nft_expr_ops nft_limit_bytes_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)), .eval = nft_limit_bytes_eval, .init = nft_limit_bytes_init, .dump = nft_limit_bytes_dump, + .clone = nft_limit_bytes_clone, + .destroy = nft_limit_bytes_destroy, }; static const struct nft_expr_ops * From 2c865a8a28a10e9800a3dd07ca339d24563e3d65 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:19 +0100 Subject: [PATCH 25/32] netfilter: nf_tables: add rule blob layout This patch adds a blob layout per chain to represent the ruleset in the packet datapath. size (unsigned long) struct nft_rule_dp struct nft_expr ... struct nft_rule_dp struct nft_expr ... struct nft_rule_dp (is_last=1) The new structure nft_rule_dp represents the rule in a more compact way (smaller memory footprint) compared to the control-plane nft_rule structure. The ruleset blob is a read-only data structure. The first field contains the blob size, then the rules containing expressions. There is a trailing rule which is used by the tracing infrastructure which is equivalent to the NULL rule marker in the previous representation. The blob size field does not include the size of this trailing rule marker. The ruleset blob is generated from the commit path. This patch reuses the infrastructure available since 0cbc06b3faba ("netfilter: nf_tables: remove synchronize_rcu in commit phase") to build the array of rules per chain. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 22 ++++- net/netfilter/nf_tables_api.c | 149 ++++++++++++++++++++---------- net/netfilter/nf_tables_core.c | 41 +++++--- net/netfilter/nf_tables_trace.c | 2 +- 4 files changed, 147 insertions(+), 67 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a0d9e0b47ab8..5a046b01bdab 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -974,6 +974,20 @@ static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, #define NFT_CHAIN_POLICY_UNSET U8_MAX +struct nft_rule_dp { + u64 is_last:1, + dlen:12, + handle:42; /* for tracing */ + unsigned char data[] + __attribute__((aligned(__alignof__(struct nft_expr)))); +}; + +struct nft_rule_blob { + unsigned long size; + unsigned char data[] + __attribute__((aligned(__alignof__(struct nft_rule_dp)))); +}; + /** * struct nft_chain - nf_tables chain * @@ -987,8 +1001,8 @@ static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, * @name: name of the chain */ struct nft_chain { - struct nft_rule *__rcu *rules_gen_0; - struct nft_rule *__rcu *rules_gen_1; + struct nft_rule_blob __rcu *blob_gen_0; + struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; @@ -1003,7 +1017,7 @@ struct nft_chain { u8 *udata; /* Only used during control plane commit phase: */ - struct nft_rule **rules_next; + struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); @@ -1321,7 +1335,7 @@ struct nft_traceinfo { const struct nft_pktinfo *pkt; const struct nft_base_chain *basechain; const struct nft_chain *chain; - const struct nft_rule *rule; + const struct nft_rule_dp *rule; const struct nft_verdict *verdict; enum nft_trace_types type; bool packet_dumped; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c0851fec11d4..2317429ea35e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1747,16 +1747,16 @@ static void nft_chain_stats_replace(struct nft_trans *trans) static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) { - struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0); - struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1); + struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0); + struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1); if (g0 != g1) kvfree(g1); kvfree(g0); /* should be NULL either via abort or via successful commit */ - WARN_ON_ONCE(chain->rules_next); - kvfree(chain->rules_next); + WARN_ON_ONCE(chain->blob_next); + kvfree(chain->blob_next); } void nf_tables_chain_destroy(struct nft_ctx *ctx) @@ -2002,23 +2002,39 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) struct nft_rules_old { struct rcu_head h; - struct nft_rule **start; + struct nft_rule_blob *blob; }; -static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain, - unsigned int alloc) +static void nft_last_rule(struct nft_rule_blob *blob, const void *ptr) { - if (alloc > INT_MAX) + struct nft_rule_dp *prule; + + prule = (struct nft_rule_dp *)ptr; + prule->is_last = 1; + ptr += offsetof(struct nft_rule_dp, data); + /* blob size does not include the trailer rule */ +} + +static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size) +{ + struct nft_rule_blob *blob; + + /* size must include room for the last rule */ + if (size < offsetof(struct nft_rule_dp, data)) return NULL; - alloc += 1; /* NULL, ends rules */ - if (sizeof(struct nft_rule *) > INT_MAX / alloc) + size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rules_old); + if (size > INT_MAX) return NULL; - alloc *= sizeof(struct nft_rule *); - alloc += sizeof(struct nft_rules_old); + blob = kvmalloc(size, GFP_KERNEL); + if (!blob) + return NULL; - return kvmalloc(alloc, GFP_KERNEL); + blob->size = 0; + nft_last_rule(blob, blob->data); + + return blob; } static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, @@ -2091,9 +2107,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_stats __percpu *stats; struct net *net = ctx->net; char name[NFT_NAME_MAXLEN]; + struct nft_rule_blob *blob; struct nft_trans *trans; struct nft_chain *chain; - struct nft_rule **rules; + unsigned int data_size; int err; if (table->use == UINT_MAX) @@ -2178,15 +2195,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]); } - rules = nf_tables_chain_alloc_rules(chain, 0); - if (!rules) { + data_size = offsetof(struct nft_rule_dp, data); /* last rule */ + blob = nf_tables_chain_alloc_rules(data_size); + if (!blob) { err = -ENOMEM; goto err_destroy_chain; } - *rules = NULL; - rcu_assign_pointer(chain->rules_gen_0, rules); - rcu_assign_pointer(chain->rules_gen_1, rules); + RCU_INIT_POINTER(chain->blob_gen_0, blob); + RCU_INIT_POINTER(chain->blob_gen_1, blob); err = nf_tables_register_hook(net, table, chain); if (err < 0) @@ -8241,32 +8258,72 @@ EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { + const struct nft_expr *expr, *last; + unsigned int size, data_size; + void *data, *data_boundary; + struct nft_rule_dp *prule; struct nft_rule *rule; - unsigned int alloc = 0; int i; /* already handled or inactive chain? */ - if (chain->rules_next || !nft_is_active_next(net, chain)) + if (chain->blob_next || !nft_is_active_next(net, chain)) return 0; rule = list_entry(&chain->rules, struct nft_rule, list); i = 0; list_for_each_entry_continue(rule, &chain->rules, list) { - if (nft_is_active_next(net, rule)) - alloc++; + if (nft_is_active_next(net, rule)) { + data_size += sizeof(*prule) + rule->dlen; + if (data_size > INT_MAX) + return -ENOMEM; + } } + data_size += offsetof(struct nft_rule_dp, data); /* last rule */ - chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc); - if (!chain->rules_next) + chain->blob_next = nf_tables_chain_alloc_rules(data_size); + if (!chain->blob_next) return -ENOMEM; + data = (void *)chain->blob_next->data; + data_boundary = data + data_size; + size = 0; + list_for_each_entry_continue(rule, &chain->rules, list) { - if (nft_is_active_next(net, rule)) - chain->rules_next[i++] = rule; + if (!nft_is_active_next(net, rule)) + continue; + + prule = (struct nft_rule_dp *)data; + data += offsetof(struct nft_rule_dp, data); + if (WARN_ON_ONCE(data > data_boundary)) + return -ENOMEM; + + nft_rule_for_each_expr(expr, last, rule) { + if (WARN_ON_ONCE(data + expr->ops->size > data_boundary)) + return -ENOMEM; + + memcpy(data + size, expr, expr->ops->size); + size += expr->ops->size; + } + if (WARN_ON_ONCE(size >= 1 << 12)) + return -ENOMEM; + + prule->handle = rule->handle; + prule->dlen = size; + prule->is_last = 0; + + data += size; + size = 0; + chain->blob_next->size += (unsigned long)(data - (void *)prule); } - chain->rules_next[i] = NULL; + prule = (struct nft_rule_dp *)data; + data += offsetof(struct nft_rule_dp, data); + if (WARN_ON_ONCE(data > data_boundary)) + return -ENOMEM; + + nft_last_rule(chain->blob_next, prule); + return 0; } @@ -8280,8 +8337,8 @@ static void nf_tables_commit_chain_prepare_cancel(struct net *net) if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { - kvfree(chain->rules_next); - chain->rules_next = NULL; + kvfree(chain->blob_next); + chain->blob_next = NULL; } } } @@ -8290,38 +8347,34 @@ static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h) { struct nft_rules_old *o = container_of(h, struct nft_rules_old, h); - kvfree(o->start); + kvfree(o->blob); } -static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules) +static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob) { - struct nft_rule **r = rules; struct nft_rules_old *old; - while (*r) - r++; - - r++; /* rcu_head is after end marker */ - old = (void *) r; - old->start = rules; + /* rcu_head is after end marker */ + old = (void *)blob + sizeof(*blob) + blob->size; + old->blob = blob; call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old); } static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) { - struct nft_rule **g0, **g1; + struct nft_rule_blob *g0, *g1; bool next_genbit; next_genbit = nft_gencursor_next(net); - g0 = rcu_dereference_protected(chain->rules_gen_0, + g0 = rcu_dereference_protected(chain->blob_gen_0, lockdep_commit_lock_is_held(net)); - g1 = rcu_dereference_protected(chain->rules_gen_1, + g1 = rcu_dereference_protected(chain->blob_gen_1, lockdep_commit_lock_is_held(net)); /* No changes to this chain? */ - if (chain->rules_next == NULL) { + if (chain->blob_next == NULL) { /* chain had no change in last or next generation */ if (g0 == g1) return; @@ -8330,10 +8383,10 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) * one uses same rules as current generation. */ if (next_genbit) { - rcu_assign_pointer(chain->rules_gen_1, g0); + rcu_assign_pointer(chain->blob_gen_1, g0); nf_tables_commit_chain_free_rules_old(g1); } else { - rcu_assign_pointer(chain->rules_gen_0, g1); + rcu_assign_pointer(chain->blob_gen_0, g1); nf_tables_commit_chain_free_rules_old(g0); } @@ -8341,11 +8394,11 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) } if (next_genbit) - rcu_assign_pointer(chain->rules_gen_1, chain->rules_next); + rcu_assign_pointer(chain->blob_gen_1, chain->blob_next); else - rcu_assign_pointer(chain->rules_gen_0, chain->rules_next); + rcu_assign_pointer(chain->blob_gen_0, chain->blob_next); - chain->rules_next = NULL; + chain->blob_next = NULL; if (g0 == g1) return; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index df5eda7c7554..36e73f9828c5 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -38,7 +38,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info, static inline void nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, - const struct nft_rule *rule, + const struct nft_rule_dp *rule, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { @@ -88,7 +88,7 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, static inline void nft_trace_verdict(struct nft_traceinfo *info, const struct nft_chain *chain, - const struct nft_rule *rule, + const struct nft_rule_dp *rule, const struct nft_regs *regs) { if (static_branch_unlikely(&nft_trace_enabled)) { @@ -153,8 +153,9 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, } struct nft_jumpstack { - const struct nft_chain *chain; - struct nft_rule *const *rules; + const struct nft_chain *chain; + const struct nft_rule_dp *rule; + const struct nft_rule_dp *last_rule; }; static void expr_call_ops_eval(const struct nft_expr *expr, @@ -183,18 +184,28 @@ static void expr_call_ops_eval(const struct nft_expr *expr, expr->ops->eval(expr, regs, pkt); } +#define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0] +#define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size +#define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen] +#define nft_rule_next(rule) (void *)rule + sizeof(*rule) + rule->dlen + +#define nft_rule_dp_for_each_expr(expr, last, rule) \ + for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \ + (expr) != (last); \ + (expr) = nft_rule_expr_next(expr)) + unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv) { const struct nft_chain *chain = priv, *basechain = chain; + const struct nft_rule_dp *rule, *last_rule; const struct net *net = nft_net(pkt); - struct nft_rule *const *rules; - const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); + struct nft_rule_blob *blob; struct nft_traceinfo info; info.trace = false; @@ -202,16 +213,16 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) nft_trace_init(&info, pkt, ®s.verdict, basechain); do_chain: if (genbit) - rules = rcu_dereference(chain->rules_gen_1); + blob = rcu_dereference(chain->blob_gen_1); else - rules = rcu_dereference(chain->rules_gen_0); + blob = rcu_dereference(chain->blob_gen_0); + rule = (struct nft_rule_dp *)blob->data; + last_rule = (void *)blob->data + blob->size; next_rule: - rule = *rules; regs.verdict.code = NFT_CONTINUE; - for (; *rules ; rules++) { - rule = *rules; - nft_rule_for_each_expr(expr, last, rule) { + for (; rule < last_rule; rule = nft_rule_next(rule)) { + nft_rule_dp_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); else if (expr->ops == &nft_bitwise_fast_ops) @@ -251,7 +262,8 @@ next_rule: if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE)) return NF_DROP; jumpstack[stackptr].chain = chain; - jumpstack[stackptr].rules = rules + 1; + jumpstack[stackptr].rule = nft_rule_next(rule); + jumpstack[stackptr].last_rule = last_rule; stackptr++; fallthrough; case NFT_GOTO: @@ -267,7 +279,8 @@ next_rule: if (stackptr > 0) { stackptr--; chain = jumpstack[stackptr].chain; - rules = jumpstack[stackptr].rules; + rule = jumpstack[stackptr].rule; + last_rule = jumpstack[stackptr].last_rule; goto next_rule; } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 84a7dea46efa..5041725423c2 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -142,7 +142,7 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, static int nf_trace_fill_rule_info(struct sk_buff *nlskb, const struct nft_traceinfo *info) { - if (!info->rule) + if (!info->rule || info->rule->is_last) return 0; /* a continue verdict with ->type == RETURN means that this is From 642c8eff5c6099dfde386ca3906fa55dc98f9ade Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:20 +0100 Subject: [PATCH 26/32] netfilter: nf_tables: add NFT_REG32_NUM Add a definition including the maximum number of 32-bits registers that are used a scratchpad memory area to store data. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5a046b01bdab..515e5db97e01 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -105,6 +105,8 @@ struct nft_data { }; } __attribute__((aligned(__alignof__(u64)))); +#define NFT_REG32_NUM 20 + /** * struct nft_regs - nf_tables register set * @@ -115,7 +117,7 @@ struct nft_data { */ struct nft_regs { union { - u32 data[20]; + u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; From 12e4ecfa244be2f117ef5304d2d866b65e70bff3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:21 +0100 Subject: [PATCH 27/32] netfilter: nf_tables: add register tracking infrastructure This patch adds new infrastructure to skip redundant selector store operations on the same register to achieve a performance boost from the packet path. This is particularly noticeable in pure linear rulesets but it also helps in rulesets which are already heaving relying in maps to avoid ruleset linear inspection. The idea is to keep data of the most recurrent store operations on register to reuse them with cmp and lookup expressions. This infrastructure allows for dynamic ruleset updates since the ruleset blob reduction happens from the kernel. Userspace still needs to be updated to maximize register utilization to cooperate to improve register data reuse / reduce number of store on register operations. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 12 ++++++++++++ net/netfilter/nf_tables_api.c | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 515e5db97e01..1c37ce61daea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -122,6 +122,16 @@ struct nft_regs { }; }; +struct nft_regs_track { + struct { + const struct nft_expr *selector; + const struct nft_expr *bitwise; + } regs[NFT_REG32_NUM]; + + const struct nft_expr *cur; + const struct nft_expr *last; +}; + /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit @@ -886,6 +896,8 @@ struct nft_expr_ops { int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); + bool (*reduce)(struct nft_regs_track *track, + const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2317429ea35e..83ce82212cbb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8259,6 +8259,7 @@ EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { const struct nft_expr *expr, *last; + struct nft_regs_track track = {}; unsigned int size, data_size; void *data, *data_boundary; struct nft_rule_dp *prule; @@ -8298,7 +8299,17 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha if (WARN_ON_ONCE(data > data_boundary)) return -ENOMEM; + size = 0; + track.last = last; nft_rule_for_each_expr(expr, last, rule) { + track.cur = expr; + + if (expr->ops->reduce && + expr->ops->reduce(&track, expr)) { + expr = track.cur; + continue; + } + if (WARN_ON_ONCE(data + expr->ops->size > data_boundary)) return -ENOMEM; From a7c176bf9f0e916f7544f6a00d898b0c90de1887 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:22 +0100 Subject: [PATCH 28/32] netfilter: nft_payload: track register operations Check if the destination register already contains the data that this payload store expression performs. This allows to skip this redundant operation. If the destination contains a different selector, update the register tracking information. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index f2e65df32a06..7a7c66e9a50e 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -210,6 +210,34 @@ nla_put_failure: return -1; } +static bool nft_payload_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct nft_payload *payload; + + if (!track->regs[priv->dreg].selector || + track->regs[priv->dreg].selector->ops != expr->ops) { + track->regs[priv->dreg].selector = expr; + track->regs[priv->dreg].bitwise = NULL; + return false; + } + + payload = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->base != payload->base || + priv->offset != payload->offset || + priv->len != payload->len) { + track->regs[priv->dreg].selector = expr; + track->regs[priv->dreg].bitwise = NULL; + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return false; +} + static bool nft_payload_offload_mask(struct nft_offload_reg *reg, u32 priv_len, u32 field_len) { @@ -513,6 +541,7 @@ static const struct nft_expr_ops nft_payload_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .reduce = nft_payload_reduce, .offload = nft_payload_offload, }; @@ -522,6 +551,7 @@ const struct nft_expr_ops nft_payload_fast_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .reduce = nft_payload_reduce, .offload = nft_payload_offload, }; From 9b17afb2c88bbadcc15b96f0275c426ae3d89a33 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:23 +0100 Subject: [PATCH 29/32] netfilter: nft_meta: track register operations Check if the destination register already contains the data that this meta store expression performs. This allows to skip this redundant operation. If the destination contains a different selector, update the register tracking information. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index fe91ff5f8fbe..430f40bc3cb4 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -750,12 +750,40 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_meta_get_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct nft_meta *meta; + + if (!track->regs[priv->dreg].selector || + track->regs[priv->dreg].selector->ops != expr->ops) { + track->regs[priv->dreg].selector = expr; + track->regs[priv->dreg].bitwise = NULL; + return false; + } + + meta = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->key != meta->key || + priv->dreg != meta->dreg) { + track->regs[priv->dreg].selector = expr; + track->regs[priv->dreg].bitwise = NULL; + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return false; +} + static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_get_eval, .init = nft_meta_get_init, .dump = nft_meta_get_dump, + .reduce = nft_meta_get_reduce, .validate = nft_meta_get_validate, .offload = nft_meta_get_offload, }; From be5650f8f47e8cffbbbcad08b71103685e971f20 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:24 +0100 Subject: [PATCH 30/32] netfilter: nft_bitwise: track register operations Check if the destination register already contains the data that this bitwise expression performs. This allows to skip this redundant operation. If the destination contains a different bitwise operation, cancel the register tracking information. If the destination contains no bitwise operation, update the register tracking information. Update the payload and meta expression to check if this bitwise operation has been already performed on the register. Hence, both the payload/meta and the bitwise expressions are reduced. There is also a special case: If source register != destination register and source register is not updated by a previous bitwise operation, then transfer selector from the source register to the destination register. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 + net/netfilter/nft_bitwise.c | 95 +++++++++++++++++++++++++++++++ net/netfilter/nft_meta.c | 2 +- net/netfilter/nft_payload.c | 2 +- 4 files changed, 99 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 1c37ce61daea..eaf55da9a205 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -358,6 +358,8 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr); +bool nft_expr_reduce_bitwise(struct nft_regs_track *track, + const struct nft_expr *expr); struct nft_set_ext; diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 47b0dba95054..7b727d3ebf9d 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -278,12 +278,52 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_bitwise_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + const struct nft_bitwise *bitwise; + + if (!track->regs[priv->sreg].selector) + return false; + + bitwise = nft_expr_priv(expr); + if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && + track->regs[priv->dreg].bitwise && + track->regs[priv->dreg].bitwise->ops == expr->ops && + priv->sreg == bitwise->sreg && + priv->dreg == bitwise->dreg && + priv->op == bitwise->op && + priv->len == bitwise->len && + !memcmp(&priv->mask, &bitwise->mask, sizeof(priv->mask)) && + !memcmp(&priv->xor, &bitwise->xor, sizeof(priv->xor)) && + !memcmp(&priv->data, &bitwise->data, sizeof(priv->data))) { + track->cur = expr; + return true; + } + + if (track->regs[priv->sreg].bitwise) { + track->regs[priv->dreg].selector = NULL; + track->regs[priv->dreg].bitwise = NULL; + return false; + } + + if (priv->sreg != priv->dreg) { + track->regs[priv->dreg].selector = + track->regs[priv->sreg].selector; + } + track->regs[priv->dreg].bitwise = expr; + + return false; +} + static const struct nft_expr_ops nft_bitwise_ops = { .type = &nft_bitwise_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), .eval = nft_bitwise_eval, .init = nft_bitwise_init, .dump = nft_bitwise_dump, + .reduce = nft_bitwise_reduce, .offload = nft_bitwise_offload, }; @@ -385,12 +425,49 @@ static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_bitwise_fast_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); + const struct nft_bitwise_fast_expr *bitwise; + + if (!track->regs[priv->sreg].selector) + return false; + + bitwise = nft_expr_priv(expr); + if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && + track->regs[priv->dreg].bitwise && + track->regs[priv->dreg].bitwise->ops == expr->ops && + priv->sreg == bitwise->sreg && + priv->dreg == bitwise->dreg && + priv->mask == bitwise->mask && + priv->xor == bitwise->xor) { + track->cur = expr; + return true; + } + + if (track->regs[priv->sreg].bitwise) { + track->regs[priv->dreg].selector = NULL; + track->regs[priv->dreg].bitwise = NULL; + return false; + } + + if (priv->sreg != priv->dreg) { + track->regs[priv->dreg].selector = + track->regs[priv->sreg].selector; + } + track->regs[priv->dreg].bitwise = expr; + + return false; +} + const struct nft_expr_ops nft_bitwise_fast_ops = { .type = &nft_bitwise_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)), .eval = NULL, /* inlined */ .init = nft_bitwise_fast_init, .dump = nft_bitwise_fast_dump, + .reduce = nft_bitwise_fast_reduce, .offload = nft_bitwise_fast_offload, }; @@ -427,3 +504,21 @@ struct nft_expr_type nft_bitwise_type __read_mostly = { .maxattr = NFTA_BITWISE_MAX, .owner = THIS_MODULE, }; + +bool nft_expr_reduce_bitwise(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_expr *last = track->last; + const struct nft_expr *next; + + if (expr == last) + return false; + + next = nft_expr_next(expr); + if (next->ops == &nft_bitwise_ops) + return nft_bitwise_reduce(track, next); + else if (next->ops == &nft_bitwise_fast_ops) + return nft_bitwise_fast_reduce(track, next); + + return false; +} diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 430f40bc3cb4..40fe48fcf9d0 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -774,7 +774,7 @@ static bool nft_meta_get_reduce(struct nft_regs_track *track, if (!track->regs[priv->dreg].bitwise) return true; - return false; + return nft_expr_reduce_bitwise(track, expr); } static const struct nft_expr_ops nft_meta_get_ops = { diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7a7c66e9a50e..f518bf634997 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -235,7 +235,7 @@ static bool nft_payload_reduce(struct nft_regs_track *track, if (!track->regs[priv->dreg].bitwise) return true; - return false; + return nft_expr_reduce_bitwise(track, expr); } static bool nft_payload_offload_mask(struct nft_offload_reg *reg, From cc003c7ee6094bca65435ca4bdbba8c98a7c859f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:25 +0100 Subject: [PATCH 31/32] netfilter: nft_payload: cancel register tracking after payload update The payload expression might mangle the packet, cancel register tracking since any payload data in the registers is stale. Finer grain register tracking cancellation by inspecting the payload base, offset and length on the register is also possible. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index f518bf634997..10f422caa76a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -798,12 +798,33 @@ nla_put_failure: return -1; } +static bool nft_payload_set_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + int i; + + for (i = 0; i < NFT_REG32_NUM; i++) { + if (!track->regs[i].selector) + continue; + + if (track->regs[i].selector->ops != &nft_payload_ops && + track->regs[i].selector->ops != &nft_payload_fast_ops) + continue; + + track->regs[i].selector = NULL; + track->regs[i].bitwise = NULL; + } + + return false; +} + static const struct nft_expr_ops nft_payload_set_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)), .eval = nft_payload_set_eval, .init = nft_payload_set_init, .dump = nft_payload_set_dump, + .reduce = nft_payload_set_reduce, }; static const struct nft_expr_ops * From 4a80e026981b791da3937470ace84796490c7796 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 9 Jan 2022 17:11:26 +0100 Subject: [PATCH 32/32] netfilter: nft_meta: cancel register tracking after meta update The meta expression might mangle the packet metadata, cancel register tracking since any metadata in the registers is stale. Finer grain register tracking cancellation by inspecting the meta type on the register is also possible. Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_meta_bridge.c | 20 ++++++++++++++++++++ net/netfilter/nft_meta.c | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 97805ec424c1..c1ef9cc89b78 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -100,6 +100,25 @@ static const struct nft_expr_ops nft_meta_bridge_get_ops = { .dump = nft_meta_get_dump, }; +static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + int i; + + for (i = 0; i < NFT_REG32_NUM; i++) { + if (!track->regs[i].selector) + continue; + + if (track->regs[i].selector->ops != &nft_meta_bridge_get_ops) + continue; + + track->regs[i].selector = NULL; + track->regs[i].bitwise = NULL; + } + + return false; +} + static const struct nft_expr_ops nft_meta_bridge_set_ops = { .type = &nft_meta_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -107,6 +126,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .reduce = nft_meta_bridge_set_reduce, .validate = nft_meta_set_validate, }; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 40fe48fcf9d0..5ab4df56c945 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -788,6 +788,25 @@ static const struct nft_expr_ops nft_meta_get_ops = { .offload = nft_meta_get_offload, }; +static bool nft_meta_set_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + int i; + + for (i = 0; i < NFT_REG32_NUM; i++) { + if (!track->regs[i].selector) + continue; + + if (track->regs[i].selector->ops != &nft_meta_get_ops) + continue; + + track->regs[i].selector = NULL; + track->regs[i].bitwise = NULL; + } + + return false; +} + static const struct nft_expr_ops nft_meta_set_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -795,6 +814,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .reduce = nft_meta_set_reduce, .validate = nft_meta_set_validate, };