From 92f3e96d642f5e05b9dc710c06fedc669f1b4f00 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 Feb 2023 11:34:27 +0100 Subject: [PATCH 01/53] netfilter: nf_tables: allow to fetch set elements when table has an owner NFT_MSG_GETSETELEM returns -EPERM when fetching set elements that belong to table that has an owner. This results in empty set/map listing from userspace. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8c09e4d12ac1..820c602d655e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5487,7 +5487,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask, NETLINK_CB(skb).portid); + genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); From ac4893980bbe79ce383daf9a0885666a30fe4c83 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 10 Feb 2023 15:17:30 +0800 Subject: [PATCH 02/53] netfilter: ctnetlink: fix possible refcount leak in ctnetlink_create_conntrack() nf_ct_put() needs to be called to put the refcount got by nf_conntrack_find_get() to avoid refcount leak when nf_conntrack_hash_check_insert() fails. Fixes: 7d367e06688d ("netfilter: ctnetlink: fix soft lockup when netlink adds new entries (v2)") Signed-off-by: Hangyu Hua Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1286ae7d4609..ca4d5bb1ea52 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2375,12 +2375,15 @@ ctnetlink_create_conntrack(struct net *net, err = nf_conntrack_hash_check_insert(ct); if (err < 0) - goto err2; + goto err3; rcu_read_unlock(); return ct; +err3: + if (ct->master) + nf_ct_put(ct->master); err2: rcu_read_unlock(); err1: From e6d57e9ff0aec323717ee36fc9ea34ad89217151 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Feb 2023 17:23:59 +0100 Subject: [PATCH 03/53] netfilter: conntrack: fix rmmod double-free race nf_conntrack_hash_check_insert() callers free the ct entry directly, via nf_conntrack_free. This isn't safe anymore because nf_conntrack_hash_check_insert() might place the entry into the conntrack table and then delteted the entry again because it found that a conntrack extension has been removed at the same time. In this case, the just-added entry is removed again and an error is returned to the caller. Problem is that another cpu might have picked up this entry and incremented its reference count. This results in a use-after-free/double-free, once by the other cpu and once by the caller of nf_conntrack_hash_check_insert(). Fix this by making nf_conntrack_hash_check_insert() not fail anymore after the insertion, just like before the 'Fixes' commit. This is safe because a racing nf_ct_iterate() has to wait for us to release the conntrack hash spinlocks. While at it, make the function return -EAGAIN in the rmmod (genid changed) case, this makes nfnetlink replay the command (suggested by Pablo Neira). Fixes: c56716c69ce1 ("netfilter: extensions: introduce extension genid count") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_bpf.c | 1 - net/netfilter/nf_conntrack_core.c | 25 +++++++++++++++---------- net/netfilter/nf_conntrack_netlink.c | 3 --- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 24002bc61e07..e1af14e3b63c 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -381,7 +381,6 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; - nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { nf_conntrack_free(nfct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 496c4920505b..ead11a9c261f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -886,10 +886,8 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone = nf_ct_zone(ct); - if (!nf_ct_ext_valid_pre(ct->ext)) { - NF_CT_STAT_INC_ATOMIC(net, insert_failed); - return -ETIMEDOUT; - } + if (!nf_ct_ext_valid_pre(ct->ext)) + return -EAGAIN; local_bh_disable(); do { @@ -924,6 +922,19 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto chaintoolong; } + /* If genid has changed, we can't insert anymore because ct + * extensions could have stale pointers and nf_ct_iterate_destroy + * might have completed its table scan already. + * + * Increment of the ext genid right after this check is fine: + * nf_ct_iterate_destroy blocks until locks are released. + */ + if (!nf_ct_ext_valid_post(ct->ext)) { + err = -EAGAIN; + goto out; + } + + ct->status |= IPS_CONFIRMED; smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); @@ -932,12 +943,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); - if (!nf_ct_ext_valid_post(ct->ext)) { - nf_ct_kill(ct); - NF_CT_STAT_INC_ATOMIC(net, drop); - return -ETIMEDOUT; - } - return 0; chaintoolong: NF_CT_STAT_INC(net, chaintoolong); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ca4d5bb1ea52..733bb56950c1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2316,9 +2316,6 @@ ctnetlink_create_conntrack(struct net *net, nfct_seqadj_ext_add(ct); nfct_synproxy_ext_add(ct); - /* we must add conntrack extensions before confirmation. */ - ct->status |= IPS_CONFIRMED; - if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) From efb056e5f1f0036179b2f92c1c15f5ea7a891d70 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 16 Feb 2023 17:05:36 +0100 Subject: [PATCH 04/53] netfilter: ip6t_rpfilter: Fix regression with VRF interfaces When calling ip6_route_lookup() for the packet arriving on the VRF interface, the result is always the real (slave) interface. Expect this when validating the result. Fixes: acc641ab95b66 ("netfilter: rpfilter/fib: Populate flowic_l3mdev field") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_rpfilter.c | 4 ++- tools/testing/selftests/netfilter/rpath.sh | 32 ++++++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index a01d9b842bd0..67c87a88cde4 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -72,7 +72,9 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, goto out; } - if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE)) + if (rt->rt6i_idev->dev == dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex || + (flags & XT_RPFILTER_LOOSE)) ret = true; out: ip6_rt_put(rt); diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/netfilter/rpath.sh index f7311e66d219..5289c8447a41 100755 --- a/tools/testing/selftests/netfilter/rpath.sh +++ b/tools/testing/selftests/netfilter/rpath.sh @@ -62,10 +62,16 @@ ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad # firewall matches to test -[ -n "$iptables" ] && ip netns exec "$ns2" \ - "$iptables" -t raw -A PREROUTING -s 192.168.0.0/16 -m rpfilter -[ -n "$ip6tables" ] && ip netns exec "$ns2" \ - "$ip6tables" -t raw -A PREROUTING -s fec0::/16 -m rpfilter +[ -n "$iptables" ] && { + common='-t raw -A PREROUTING -s 192.168.0.0/16' + ip netns exec "$ns2" "$iptables" $common -m rpfilter + ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert +} +[ -n "$ip6tables" ] && { + common='-t raw -A PREROUTING -s fec0::/16' + ip netns exec "$ns2" "$ip6tables" $common -m rpfilter + ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert +} [ -n "$nft" ] && ip netns exec "$ns2" $nft -f - </dev/null } -testrun() { - # clear counters first +clear_counters() { [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z if [ -n "$nft" ]; then @@ -111,6 +121,10 @@ testrun() { ip netns exec "$ns2" $nft -s list table inet t; ) | ip netns exec "$ns2" $nft -f - fi +} + +testrun() { + clear_counters # test 1: martian traffic should fail rpfilter matches netns_ping "$ns1" -I v0 192.168.42.1 && \ @@ -120,9 +134,13 @@ testrun() { ipt_zero_rule "$iptables" || die "iptables matched martian" ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" + ipt_zero_reverse_rule "$iptables" && die "iptables not matched martian" + ipt_zero_reverse_rule "$ip6tables" && die "ip6tables not matched martian" nft_zero_rule ip || die "nft IPv4 matched martian" nft_zero_rule ip6 || die "nft IPv6 matched martian" + clear_counters + # test 2: rpfilter match should pass for regular traffic netns_ping "$ns1" 192.168.23.1 || \ die "regular ping 192.168.23.1 failed" @@ -131,6 +149,8 @@ testrun() { ipt_zero_rule "$iptables" && die "iptables match not effective" ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" + ipt_zero_reverse_rule "$iptables" || die "iptables match over-effective" + ipt_zero_reverse_rule "$ip6tables" || die "ip6tables match over-effective" nft_zero_rule ip && die "nft IPv4 match not effective" nft_zero_rule ip6 && die "nft IPv6 match not effective" From e58a171d35e32e6e8c37cfe0e8a94406732a331f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Feb 2023 23:20:06 +0100 Subject: [PATCH 05/53] netfilter: ebtables: fix table blob use-after-free We are not allowed to return an error at this point. Looking at the code it looks like ret is always 0 at this point, but its not. t = find_table_lock(net, repl->name, &ret, &ebt_mutex); ... this can return a valid table, with ret != 0. This bug causes update of table->private with the new blob, but then frees the blob right away in the caller. Syzbot report: BUG: KASAN: vmalloc-out-of-bounds in __ebt_unregister_table+0xc00/0xcd0 net/bridge/netfilter/ebtables.c:1168 Read of size 4 at addr ffffc90005425000 by task kworker/u4:4/74 Workqueue: netns cleanup_net Call Trace: kasan_report+0xbf/0x1f0 mm/kasan/report.c:517 __ebt_unregister_table+0xc00/0xcd0 net/bridge/netfilter/ebtables.c:1168 ebt_unregister_table+0x35/0x40 net/bridge/netfilter/ebtables.c:1372 ops_exit_list+0xb0/0x170 net/core/net_namespace.c:169 cleanup_net+0x4ee/0xb10 net/core/net_namespace.c:613 ... ip(6)tables appears to be ok (ret should be 0 at this point) but make this more obvious. Fixes: c58dd2dd443c ("netfilter: Can't fail and free after table replacement") Reported-by: syzbot+f61594de72d6705aea03@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 3 +-- net/ipv6/netfilter/ip6_tables.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index ce5dfa3babd2..757ec46fc45a 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1090,7 +1090,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REPLACE, GFP_KERNEL); - return ret; + return 0; free_unlock: mutex_unlock(&ebt_mutex); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2ed7c58b471a..aae5fd51dfd7 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1045,7 +1045,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_counters *counters; struct ipt_entry *iter; - ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; @@ -1091,7 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); } vfree(counters); - return ret; + return 0; put_module: module_put(t->me); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2d816277f2c5..ac902f7bca47 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1062,7 +1062,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_counters *counters; struct ip6t_entry *iter; - ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; @@ -1108,7 +1107,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); - return ret; + return 0; put_module: module_put(t->me); From 05c07c0c6cc8ec2278ace9871618c41f1365d1f5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 17 Feb 2023 18:22:57 -0500 Subject: [PATCH 06/53] netfilter: xt_length: use skb len to match in length_mt6 For IPv6 Jumbo packets, the ipv6_hdr(skb)->payload_len is always 0, and its real payload_len ( > 65535) is saved in hbh exthdr. With 0 length for the jumbo packets, it may mismatch. To fix this, we can just use skb->len instead of parsing exthdrs, as the hbh exthdr parsing has been done before coming to length_mt6 in ip6_rcv_core() and br_validate_ipv6() and also the packet has been trimmed according to the correct IPv6 (ext)hdr length there, and skb len is trustable in length_mt6(). Note that this patch is especially needed after the IPv6 BIG TCP was supported in kernel, which is using IPv6 Jumbo packets. Besides, to match the packets greater than 65535 more properly, a v1 revision of xt_length may be needed to extend "min, max" to u32 in the future, and for now the IPv6 Jumbo packets can be matched by: # ip6tables -m length ! --length 0:65535 Fixes: 7c4e983c4f3c ("net: allow gso_max_size to exceed 65536") Fixes: 0fe79f28bfaf ("net: allow gro_max_size to exceed 65536") Signed-off-by: Xin Long Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_length.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 1873da3a945a..9fbfad13176f 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -30,8 +30,7 @@ static bool length_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_length_info *info = par->matchinfo; - const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + - sizeof(struct ipv6hdr); + u32 pktlen = skb->len; return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } From fdf6491193e411087ae77bcbc6468e3e1cff99ed Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 Feb 2023 17:24:00 +0100 Subject: [PATCH 07/53] netfilter: ctnetlink: make event listener tracking global pernet tracking doesn't work correctly because other netns might have set NETLINK_LISTEN_ALL_NSID on its event socket. In this case its expected that events originating in other net namespaces are also received. Making pernet-tracking work while also honoring NETLINK_LISTEN_ALL_NSID requires much more intrusive changes both in netlink and nfnetlink, f.e. adding a 'setsockopt' callback that lets nfnetlink know that the event socket entered (or left) ALL_NSID mode. Move to global tracking instead: if there is an event socket anywhere on the system, all net namespaces which have conntrack enabled and use autobind mode will allocate the ecache extension. netlink_has_listeners() returns false only if the given group has no subscribers in any net namespace, the 'net' argument passed to nfnetlink_has_listeners is only used to derive the protocol (nfnetlink), it has no other effect. For proper NETLINK_LISTEN_ALL_NSID-aware pernet tracking of event listeners a new netlink_has_net_listeners() is also needed. Fixes: 90d1daa45849 ("netfilter: conntrack: add nf_conntrack_events autodetect mode") Reported-by: Bryce Kahle Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 5 +++++ include/net/netns/conntrack.h | 1 - net/netfilter/core.c | 3 +++ net/netfilter/nf_conntrack_ecache.c | 2 +- net/netfilter/nfnetlink.c | 9 +++++---- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index d8817d381c14..bef8db9d6c08 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -488,4 +488,9 @@ extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; */ DECLARE_PER_CPU(bool, nf_skb_duplicated); +/** + * Contains bitmask of ctnetlink event subscribers, if any. + * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. + */ +extern u8 nf_ctnetlink_has_listener; #endif /*__LINUX_NETFILTER_H*/ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index e1290c159184..1f463b3957c7 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -95,7 +95,6 @@ struct nf_ip_net { struct netns_ct { #ifdef CONFIG_NF_CONNTRACK_EVENTS - u8 ctnetlink_has_listener; bool ecache_dwork_pending; #endif u8 sysctl_log_invalid; /* Log invalid packets */ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5a6705a0e4ec..6e80f0f6149e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -669,6 +669,9 @@ const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) +u8 nf_ctnetlink_has_listener; +EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); + const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 8698b3424646..69948e1d6974 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -309,7 +309,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp break; return true; case 2: /* autodetect: no event listener, don't allocate extension. */ - if (!READ_ONCE(net->ct.ctnetlink_has_listener)) + if (!READ_ONCE(nf_ctnetlink_has_listener)) return true; fallthrough; case 1: diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 6d18fb346868..81c7737c803a 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -29,6 +29,7 @@ #include #include +#include #include MODULE_LICENSE("GPL"); @@ -685,12 +686,12 @@ static void nfnetlink_bind_event(struct net *net, unsigned int group) group_bit = (1 << group); spin_lock(&nfnl_grp_active_lock); - v = READ_ONCE(net->ct.ctnetlink_has_listener); + v = READ_ONCE(nf_ctnetlink_has_listener); if ((v & group_bit) == 0) { v |= group_bit; /* read concurrently without nfnl_grp_active_lock held. */ - WRITE_ONCE(net->ct.ctnetlink_has_listener, v); + WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); @@ -744,12 +745,12 @@ static void nfnetlink_unbind(struct net *net, int group) spin_lock(&nfnl_grp_active_lock); if (!nfnetlink_has_listeners(net, group)) { - u8 v = READ_ONCE(net->ct.ctnetlink_has_listener); + u8 v = READ_ONCE(nf_ctnetlink_has_listener); v &= ~group_bit; /* read concurrently without nfnl_grp_active_lock held. */ - WRITE_ONCE(net->ct.ctnetlink_has_listener, v); + WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); #endif From 0af8c09c896810879387decfba8c942994bb61f5 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 13 Feb 2023 12:25:05 +0800 Subject: [PATCH 08/53] netfilter: x_tables: fix percpu counter block leak on error path when creating new netns Here is the stack where we allocate percpu counter block: +-< __alloc_percpu +-< xt_percpu_counter_alloc +-< find_check_entry # {arp,ip,ip6}_tables.c +-< translate_table And it can be leaked on this code path: +-> ip6t_register_table +-> translate_table # allocates percpu counter block +-> xt_register_table # fails there is no freeing of the counter block on xt_register_table fail. Note: xt_percpu_counter_free should be called to free it like we do in do_replace through cleanup_entry helper (or in __ip6t_unregister_table). Probability of hitting this error path is low AFAICS (xt_register_table can only return ENOMEM here, as it is not replacing anything, as we are creating new netns, and it is hard to imagine that all previous allocations succeeded and after that one in xt_register_table failed). But it's worth fixing even the rare leak. Fixes: 71ae0dff02d7 ("netfilter: xtables: use percpu rule counters") Signed-off-by: Pavel Tikhomirov Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 4 ++++ net/ipv4/netfilter/ip_tables.c | 4 ++++ net/ipv6/netfilter/ip6_tables.c | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index ffc0cab7cf18..2407066b0fec 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1525,6 +1525,10 @@ int arpt_register_table(struct net *net, new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { + struct arpt_entry *iter; + + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index aae5fd51dfd7..da5998011ab9 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1741,6 +1741,10 @@ int ipt_register_table(struct net *net, const struct xt_table *table, new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { + struct ipt_entry *iter; + + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index ac902f7bca47..0ce0ed17c758 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1750,6 +1750,10 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { + struct ip6t_entry *iter; + + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } From 7c15430822e71e90203d87e6d0cfe83fa058b0dc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 1 Feb 2023 12:32:01 -0600 Subject: [PATCH 09/53] wifi: ath11k: allow system suspend to survive ath11k When ath11k runs into internal errors upon suspend, it returns an error code to pci_pm_suspend, which aborts the entire system suspend. The driver should not abort system suspend, but should keep its internal errors to itself, and allow the system to suspend. Otherwise, a user can suspend a laptop by closing the lid and sealing it into a case, assuming that is will suspend, rather than heating up and draining the battery when in transit. In practice, the ath11k device seems to have plenty of transient errors, and subsequent suspend cycles after this failure often succeed. https://bugzilla.kernel.org/show_bug.cgi?id=216968 Fixes: d1b0c33850d29 ("ath11k: implement suspend for QCA6390 PCI devices") Signed-off-by: Len Brown Cc: stable@vger.kernel.org Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20230201183201.14431-1-len.brown@intel.com --- drivers/net/wireless/ath/ath11k/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/pci.c b/drivers/net/wireless/ath/ath11k/pci.c index 776362d151cb..0aeef2948ff5 100644 --- a/drivers/net/wireless/ath/ath11k/pci.c +++ b/drivers/net/wireless/ath/ath11k/pci.c @@ -981,7 +981,7 @@ static __maybe_unused int ath11k_pci_pm_suspend(struct device *dev) if (ret) ath11k_warn(ab, "failed to suspend core: %d\n", ret); - return ret; + return 0; } static __maybe_unused int ath11k_pci_pm_resume(struct device *dev) From 67d93ffc0f3c47094750bde6d62e7c5765dc47a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Tue, 21 Feb 2023 14:06:16 +0100 Subject: [PATCH 10/53] ptp: vclock: use mutex to fix "sleep on atomic" bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vclocks were using spinlocks to protect access to its timecounter and cyclecounter. Access to timecounter/cyclecounter is backed by the same driver callbacks that are used for non-virtual PHCs, but the usage of the spinlock imposes a new limitation that didn't exist previously: now they're called in atomic context so they mustn't sleep. Some drivers like sfc or ice may sleep on these callbacks, causing errors like "BUG: scheduling while atomic: ptp5/25223/0x00000002" Fix it replacing the vclock's spinlock by a mutex. It fix the mentioned bug and it doesn't introduce longer delays. I've tested synchronizing various different combinations of clocks: - vclock->sysclock - sysclock->vclock - vclock->vclock - hardware PHC in different NIC -> vclock - created 4 vclocks and launch 4 parallel phc2sys processes with lockdep enabled In all cases, comparing the delays reported by phc2sys, they are in the same range of values than before applying the patch. Link: https://lore.kernel.org/netdev/69d0ff33-bd32-6aa5-d36c-fbdc3c01337c@redhat.com/ Fixes: 5d43f951b1ac ("ptp: add ptp virtual clock driver framework") Reported-by: Yalin Li Suggested-by: Richard Cochran Tested-by: Miroslav Lichvar Signed-off-by: Íñigo Huguet Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20230221130616.21837-1-ihuguet@redhat.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_private.h | 2 +- drivers/ptp/ptp_vclock.c | 44 +++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 77918a2c6701..75f58fc468a7 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -66,7 +66,7 @@ struct ptp_vclock { struct hlist_node vclock_hash_node; struct cyclecounter cc; struct timecounter tc; - spinlock_t lock; /* protects tc/cc */ + struct mutex lock; /* protects tc/cc */ }; /* diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c index 1c0ed4805c0a..dcf752c9e045 100644 --- a/drivers/ptp/ptp_vclock.c +++ b/drivers/ptp/ptp_vclock.c @@ -43,16 +43,16 @@ static void ptp_vclock_hash_del(struct ptp_vclock *vclock) static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) { struct ptp_vclock *vclock = info_to_vclock(ptp); - unsigned long flags; s64 adj; adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT; adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR); - spin_lock_irqsave(&vclock->lock, flags); + if (mutex_lock_interruptible(&vclock->lock)) + return -EINTR; timecounter_read(&vclock->tc); vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj; - spin_unlock_irqrestore(&vclock->lock, flags); + mutex_unlock(&vclock->lock); return 0; } @@ -60,11 +60,11 @@ static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct ptp_vclock *vclock = info_to_vclock(ptp); - unsigned long flags; - spin_lock_irqsave(&vclock->lock, flags); + if (mutex_lock_interruptible(&vclock->lock)) + return -EINTR; timecounter_adjtime(&vclock->tc, delta); - spin_unlock_irqrestore(&vclock->lock, flags); + mutex_unlock(&vclock->lock); return 0; } @@ -73,12 +73,12 @@ static int ptp_vclock_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); - unsigned long flags; u64 ns; - spin_lock_irqsave(&vclock->lock, flags); + if (mutex_lock_interruptible(&vclock->lock)) + return -EINTR; ns = timecounter_read(&vclock->tc); - spin_unlock_irqrestore(&vclock->lock, flags); + mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; @@ -91,7 +91,6 @@ static int ptp_vclock_gettimex(struct ptp_clock_info *ptp, struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; struct timespec64 pts; - unsigned long flags; int err; u64 ns; @@ -99,9 +98,10 @@ static int ptp_vclock_gettimex(struct ptp_clock_info *ptp, if (err) return err; - spin_lock_irqsave(&vclock->lock, flags); + if (mutex_lock_interruptible(&vclock->lock)) + return -EINTR; ns = timecounter_cyc2time(&vclock->tc, timespec64_to_ns(&pts)); - spin_unlock_irqrestore(&vclock->lock, flags); + mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); @@ -113,11 +113,11 @@ static int ptp_vclock_settime(struct ptp_clock_info *ptp, { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns = timespec64_to_ns(ts); - unsigned long flags; - spin_lock_irqsave(&vclock->lock, flags); + if (mutex_lock_interruptible(&vclock->lock)) + return -EINTR; timecounter_init(&vclock->tc, &vclock->cc, ns); - spin_unlock_irqrestore(&vclock->lock, flags); + mutex_unlock(&vclock->lock); return 0; } @@ -127,7 +127,6 @@ static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp, { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; - unsigned long flags; int err; u64 ns; @@ -135,9 +134,10 @@ static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp, if (err) return err; - spin_lock_irqsave(&vclock->lock, flags); + if (mutex_lock_interruptible(&vclock->lock)) + return -EINTR; ns = timecounter_cyc2time(&vclock->tc, ktime_to_ns(xtstamp->device)); - spin_unlock_irqrestore(&vclock->lock, flags); + mutex_unlock(&vclock->lock); xtstamp->device = ns_to_ktime(ns); @@ -205,7 +205,7 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) INIT_HLIST_NODE(&vclock->vclock_hash_node); - spin_lock_init(&vclock->lock); + mutex_init(&vclock->lock); vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev); if (IS_ERR_OR_NULL(vclock->clock)) { @@ -269,7 +269,6 @@ ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) { unsigned int hash = vclock_index % HASH_SIZE(vclock_hash); struct ptp_vclock *vclock; - unsigned long flags; u64 ns; u64 vclock_ns = 0; @@ -281,9 +280,10 @@ ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) if (vclock->clock->index != vclock_index) continue; - spin_lock_irqsave(&vclock->lock, flags); + if (mutex_lock_interruptible(&vclock->lock)) + break; vclock_ns = timecounter_cyc2time(&vclock->tc, ns); - spin_unlock_irqrestore(&vclock->lock, flags); + mutex_unlock(&vclock->lock); break; } From 4cc59f386991ec9374cb4bc83dbe1c0b5a95033f Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Wed, 22 Feb 2023 16:36:28 +0800 Subject: [PATCH 11/53] ipv6: Add lwtunnel encap size of all siblings in nexthop calculation In function rt6_nlmsg_size(), the length of nexthop is calculated by multipling the nexthop length of fib6_info and the number of siblings. However if the fib6_info has no lwtunnel but the siblings have lwtunnels, the nexthop length is less than it should be, and it will trigger a warning in inet6_rt_notify() as follows: WARNING: CPU: 0 PID: 6082 at net/ipv6/route.c:6180 inet6_rt_notify+0x120/0x130 ...... Call Trace: fib6_add_rt2node+0x685/0xa30 fib6_add+0x96/0x1b0 ip6_route_add+0x50/0xd0 inet6_rtm_newroute+0x97/0xa0 rtnetlink_rcv_msg+0x156/0x3d0 netlink_rcv_skb+0x5a/0x110 netlink_unicast+0x246/0x350 netlink_sendmsg+0x250/0x4c0 sock_sendmsg+0x66/0x70 ___sys_sendmsg+0x7c/0xd0 __sys_sendmsg+0x5d/0xb0 do_syscall_64+0x3f/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc This bug can be reproduced by script: ip -6 addr add 2002::2/64 dev ens2 ip -6 route add 100::/64 via 2002::1 dev ens2 metric 100 for i in 10 20 30 40 50 60 70; do ip link add link ens2 name ipv_$i type ipvlan ip -6 addr add 2002::$i/64 dev ipv_$i ifconfig ipv_$i up done for i in 10 20 30 40 50 60; do ip -6 route append 100::/64 encap ip6 dst 2002::$i via 2002::1 dev ipv_$i metric 100 done ip -6 route append 100::/64 via 2002::1 dev ipv_70 metric 100 This patch fixes it by adding nexthop_len of every siblings using rt6_nh_nlmsg_size(). Fixes: beb1afac518d ("net: ipv6: Add support to dump multipath routes via RTA_MULTIPATH attribute") Signed-off-by: Lu Wei Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230222083629.335683-2-luwei32@huawei.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c180c2ef17c5..0fdb03df2287 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5533,16 +5533,17 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i) nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); } else { + struct fib6_info *sibling, *next_sibling; struct fib6_nh *nh = f6i->fib6_nh; nexthop_len = 0; if (f6i->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(nh->fib_nh_lws); + rt6_nh_nlmsg_size(nh, &nexthop_len); - nexthop_len *= f6i->fib6_nsiblings; + list_for_each_entry_safe(sibling, next_sibling, + &f6i->fib6_siblings, fib6_siblings) { + rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); + } } nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } From 44bd0394fe10903757da0863e0cf62c2d9846ea6 Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Wed, 22 Feb 2023 16:36:29 +0800 Subject: [PATCH 12/53] selftests: fib_tests: Add test cases for IPv4/IPv6 in route notify Add tests to check whether the total fib info length is calculated corretly in route notify process. Signed-off-by: Lu Wei Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230222083629.335683-3-luwei32@huawei.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fib_tests.sh | 96 +++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 70ea8798b1f6..7da8ec838c63 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -9,7 +9,7 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh" +TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh" VERBOSE=0 PAUSE_ON_FAIL=no @@ -655,6 +655,98 @@ fib_nexthop_test() cleanup } +fib6_notify_test() +{ + setup + + echo + echo "Fib6 info length calculation in route notify test" + set -e + + for i in 10 20 30 40 50 60 70; + do + $IP link add dummy_$i type dummy + $IP link set dev dummy_$i up + $IP -6 address add 2001:$i::1/64 dev dummy_$i + done + + $NS_EXEC ip monitor route &> errors.txt & + sleep 2 + + $IP -6 route add 2001::/64 \ + nexthop via 2001:10::2 dev dummy_10 \ + nexthop encap ip6 dst 2002::20 via 2001:20::2 dev dummy_20 \ + nexthop encap ip6 dst 2002::30 via 2001:30::2 dev dummy_30 \ + nexthop encap ip6 dst 2002::40 via 2001:40::2 dev dummy_40 \ + nexthop encap ip6 dst 2002::50 via 2001:50::2 dev dummy_50 \ + nexthop encap ip6 dst 2002::60 via 2001:60::2 dev dummy_60 \ + nexthop encap ip6 dst 2002::70 via 2001:70::2 dev dummy_70 + + set +e + + err=`cat errors.txt |grep "Message too long"` + if [ -z "$err" ];then + ret=0 + else + ret=1 + fi + + log_test $ret 0 "ipv6 route add notify" + + { kill %% && wait %%; } 2>/dev/null + + #rm errors.txt + + cleanup &> /dev/null +} + + +fib_notify_test() +{ + setup + + echo + echo "Fib4 info length calculation in route notify test" + + set -e + + for i in 10 20 30 40 50 60 70; + do + $IP link add dummy_$i type dummy + $IP link set dev dummy_$i up + $IP address add 20.20.$i.2/24 dev dummy_$i + done + + $NS_EXEC ip monitor route &> errors.txt & + sleep 2 + + $IP route add 10.0.0.0/24 \ + nexthop via 20.20.10.1 dev dummy_10 \ + nexthop encap ip dst 192.168.10.20 via 20.20.20.1 dev dummy_20 \ + nexthop encap ip dst 192.168.10.30 via 20.20.30.1 dev dummy_30 \ + nexthop encap ip dst 192.168.10.40 via 20.20.40.1 dev dummy_40 \ + nexthop encap ip dst 192.168.10.50 via 20.20.50.1 dev dummy_50 \ + nexthop encap ip dst 192.168.10.60 via 20.20.60.1 dev dummy_60 \ + nexthop encap ip dst 192.168.10.70 via 20.20.70.1 dev dummy_70 + + set +e + + err=`cat errors.txt |grep "Message too long"` + if [ -z "$err" ];then + ret=0 + else + ret=1 + fi + + log_test $ret 0 "ipv4 route add notify" + + { kill %% && wait %%; } 2>/dev/null + + rm errors.txt + + cleanup &> /dev/null +} + fib_suppress_test() { echo @@ -2111,6 +2203,8 @@ do fib_carrier_test|carrier) fib_carrier_test;; fib_rp_filter_test|rp_filter) fib_rp_filter_test;; fib_nexthop_test|nexthop) fib_nexthop_test;; + fib_notify_test|ipv4_notify) fib_notify_test;; + fib6_notify_test|ipv6_notify) fib6_notify_test;; fib_suppress_test|suppress) fib_suppress_test;; ipv6_route_test|ipv6_rt) ipv6_route_test;; ipv4_route_test|ipv4_rt) ipv4_route_test;; From c749e3f82a15e10a798bb55f60368ee102c793cb Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 22 Feb 2023 11:06:40 +0200 Subject: [PATCH 13/53] net/mlx5: Fix memory leak in IPsec RoCE creation During IPsec RoCE TX creation a struct for the flow group creation is allocated, but never freed. Free that struct once it is no longer in use. Fixes: 22551e77e550 ("net/mlx5: Configure IPsec steering for egress RoCEv2 traffic") Signed-off-by: Patrisious Haddad Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/a69739482cca7176d3a466f87bbf5af1250b09bb.1677056384.git.leon@kernel.org Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c index 2c53589b765d..6e3f178d6f84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c @@ -162,7 +162,7 @@ int mlx5_ipsec_fs_roce_tx_create(struct mlx5_core_dev *mdev, if (IS_ERR(ft)) { err = PTR_ERR(ft); mlx5_core_err(mdev, "Fail to create RoCE IPsec tx ft err=%d\n", err); - return err; + goto free_in; } roce->ft = ft; @@ -174,22 +174,25 @@ int mlx5_ipsec_fs_roce_tx_create(struct mlx5_core_dev *mdev, if (IS_ERR(g)) { err = PTR_ERR(g); mlx5_core_err(mdev, "Fail to create RoCE IPsec tx group err=%d\n", err); - goto fail; + goto destroy_table; } roce->g = g; err = ipsec_fs_roce_tx_rule_setup(mdev, roce, pol_ft); if (err) { mlx5_core_err(mdev, "Fail to create RoCE IPsec tx rules err=%d\n", err); - goto rule_fail; + goto destroy_group; } + kvfree(in); return 0; -rule_fail: +destroy_group: mlx5_destroy_flow_group(roce->g); -fail: +destroy_table: mlx5_destroy_flow_table(ft); +free_in: + kvfree(in); return err; } From e209519b623391465c0b37caca89bf0ffff91f53 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Feb 2023 06:50:40 +0100 Subject: [PATCH 14/53] net: phy: c45: use "supported_eee" instead of supported for access validation Make sure we use proper variable to validate access to potentially not supported registers. Otherwise we will get false read/write errors. Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202302211644.c12d19de-yujie.liu@intel.com Fixes: 022c3f87f88e ("net: phy: add genphy_c45_ethtool_get/set_eee() support") Signed-off-by: Oleksij Rempel Reviewed-by: Russell King (Oracle) Signed-off-by: Paolo Abeni --- drivers/net/phy/phy-c45.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index f9b128cecc3f..f23cce2c5199 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -674,7 +674,7 @@ int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv) { int val, changed; - if (linkmode_intersects(phydev->supported, PHY_EEE_CAP1_FEATURES)) { + if (linkmode_intersects(phydev->supported_eee, PHY_EEE_CAP1_FEATURES)) { val = linkmode_to_mii_eee_cap1_t(adv); /* In eee_broken_modes are stored MDIO_AN_EEE_ADV specific raw @@ -726,7 +726,7 @@ static int genphy_c45_read_eee_adv(struct phy_device *phydev, { int val; - if (linkmode_intersects(phydev->supported, PHY_EEE_CAP1_FEATURES)) { + if (linkmode_intersects(phydev->supported_eee, PHY_EEE_CAP1_FEATURES)) { /* IEEE 802.3-2018 45.2.7.13 EEE advertisement 1 * (Register 7.60) */ @@ -762,7 +762,7 @@ static int genphy_c45_read_eee_lpa(struct phy_device *phydev, { int val; - if (linkmode_intersects(phydev->supported, PHY_EEE_CAP1_FEATURES)) { + if (linkmode_intersects(phydev->supported_eee, PHY_EEE_CAP1_FEATURES)) { /* IEEE 802.3-2018 45.2.7.14 EEE link partner ability 1 * (Register 7.61) */ From b6478b8c93304fa0483e4657779b44634a1711c7 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Feb 2023 06:50:41 +0100 Subject: [PATCH 15/53] net: phy: c45: add genphy_c45_an_config_eee_aneg() function Add new genphy_c45_an_config_eee_aneg() function and replace some of genphy_c45_write_eee_adv() calls. This will be needed by the next patch. Signed-off-by: Oleksij Rempel Reviewed-by: Russell King (Oracle) Signed-off-by: Paolo Abeni --- drivers/net/phy/phy-c45.c | 11 ++++++++++- drivers/net/phy/phy_device.c | 2 +- include/linux/phy.h | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index f23cce2c5199..784868e818a7 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -262,7 +262,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev) linkmode_and(phydev->advertising, phydev->advertising, phydev->supported); - ret = genphy_c45_write_eee_adv(phydev, phydev->supported_eee); + ret = genphy_c45_an_config_eee_aneg(phydev); if (ret < 0) return ret; else if (ret) @@ -858,6 +858,15 @@ int genphy_c45_read_eee_abilities(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(genphy_c45_read_eee_abilities); +/** + * genphy_c45_an_config_eee_aneg - configure EEE advertisement + * @phydev: target phy_device struct + */ +int genphy_c45_an_config_eee_aneg(struct phy_device *phydev) +{ + return genphy_c45_write_eee_adv(phydev, phydev->supported_eee); +} + /** * genphy_c45_pma_read_abilities - read supported link modes from PMA * @phydev: target phy_device struct diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 71becceb8764..570a5803f9c2 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2231,7 +2231,7 @@ int __genphy_config_aneg(struct phy_device *phydev, bool changed) { int err; - err = genphy_c45_write_eee_adv(phydev, phydev->supported_eee); + err = genphy_c45_an_config_eee_aneg(phydev); if (err < 0) return err; else if (err) diff --git a/include/linux/phy.h b/include/linux/phy.h index 727bff531a14..19d83e112beb 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1765,6 +1765,7 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev, int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv); +int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; From 3eeca4e199cee2066c65b872391cecee5cbbbb81 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Feb 2023 06:50:42 +0100 Subject: [PATCH 16/53] net: phy: do not force EEE support With following patches: commit 9b01c885be36 ("net: phy: c22: migrate to genphy_c45_write_eee_adv()") commit 5827b168125d ("net: phy: c45: migrate to genphy_c45_write_eee_adv()") we set the advertisement to potentially supported values. This behavior may introduce new regressions on systems where EEE was disabled by default (BIOS or boot loader configuration or by other ways.) At same time, with this patches, we would overwrite EEE advertisement configuration made over ethtool. To avoid this issues, we need to cache initial and ethtool advertisement configuration and store it for later use. Fixes: 9b01c885be36 ("net: phy: c22: migrate to genphy_c45_write_eee_adv()") Fixes: 5827b168125d ("net: phy: c45: migrate to genphy_c45_write_eee_adv()") Fixes: 022c3f87f88e ("net: phy: add genphy_c45_ethtool_get/set_eee() support") Signed-off-by: Oleksij Rempel Reviewed-by: Russell King (Oracle) Signed-off-by: Paolo Abeni --- drivers/net/phy/phy-c45.c | 24 +++++++++++++++++------- drivers/net/phy/phy_device.c | 19 +++++++++++++++++++ include/linux/phy.h | 5 +++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 784868e818a7..8717c122e2f3 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -721,8 +721,7 @@ int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv) * @phydev: target phy_device struct * @adv: the linkmode advertisement status */ -static int genphy_c45_read_eee_adv(struct phy_device *phydev, - unsigned long *adv) +int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv) { int val; @@ -864,7 +863,13 @@ EXPORT_SYMBOL_GPL(genphy_c45_read_eee_abilities); */ int genphy_c45_an_config_eee_aneg(struct phy_device *phydev) { - return genphy_c45_write_eee_adv(phydev, phydev->supported_eee); + if (!phydev->eee_enabled) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(adv) = {}; + + return genphy_c45_write_eee_adv(phydev, adv); + } + + return genphy_c45_write_eee_adv(phydev, phydev->advertising_eee); } /** @@ -1430,17 +1435,22 @@ EXPORT_SYMBOL(genphy_c45_ethtool_get_eee); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(adv) = {}; int ret; if (data->eee_enabled) { if (data->advertised) - adv[0] = data->advertised; + ethtool_convert_legacy_u32_to_link_mode(phydev->advertising_eee, + data->advertised); else - linkmode_copy(adv, phydev->supported_eee); + linkmode_copy(phydev->advertising_eee, + phydev->supported_eee); + + phydev->eee_enabled = true; + } else { + phydev->eee_enabled = false; } - ret = genphy_c45_write_eee_adv(phydev, adv); + ret = genphy_c45_an_config_eee_aneg(phydev); if (ret < 0) return ret; if (ret > 0) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 570a5803f9c2..3f8a64fb9d71 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3141,6 +3141,25 @@ static int phy_probe(struct device *dev) of_set_phy_supported(phydev); phy_advertise_supported(phydev); + /* Get PHY default EEE advertising modes and handle them as potentially + * safe initial configuration. + */ + err = genphy_c45_read_eee_adv(phydev, phydev->advertising_eee); + if (err) + return err; + + /* There is no "enabled" flag. If PHY is advertising, assume it is + * kind of enabled. + */ + phydev->eee_enabled = !linkmode_empty(phydev->advertising_eee); + + /* Some PHYs may advertise, by default, not support EEE modes. So, + * we need to clean them. + */ + if (phydev->eee_enabled) + linkmode_and(phydev->advertising_eee, phydev->supported_eee, + phydev->advertising_eee); + /* Get the EEE modes we want to prohibit. We will ask * the PHY stop advertising these mode later on */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 19d83e112beb..36bf0bbc8efa 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -575,6 +575,8 @@ struct macsec_ops; * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @supported_eee: supported PHY EEE linkmodes + * @advertising_eee: Currently advertised EEE linkmodes + * @eee_enabled: Flag indicating whether the EEE feature is enabled * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited @@ -681,6 +683,8 @@ struct phy_device { __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); /* used for eee validation */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); + bool eee_enabled; /* Host supported PHY interface types. Should be ignored if empty. */ DECLARE_PHY_INTERFACE_MASK(host_interfaces); @@ -1766,6 +1770,7 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); +int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; From 186b1da76b7271d65b48af5abae10a53d679c776 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Feb 2023 06:50:43 +0100 Subject: [PATCH 17/53] net: phy: c45: genphy_c45_ethtool_set_eee: validate EEE link modes Currently, it is possible to let some PHYs to advertise not supported EEE link modes. So, validate them before overwriting existing configuration. Signed-off-by: Oleksij Rempel Reviewed-by: Russell King (Oracle) Signed-off-by: Paolo Abeni --- drivers/net/phy/phy-c45.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 8717c122e2f3..3813b86689d0 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -1438,12 +1438,23 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev, int ret; if (data->eee_enabled) { - if (data->advertised) + if (data->advertised) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(adv); + + ethtool_convert_legacy_u32_to_link_mode(adv, + data->advertised); + linkmode_andnot(adv, adv, phydev->supported_eee); + if (!linkmode_empty(adv)) { + phydev_warn(phydev, "At least some EEE link modes are not supported.\n"); + return -EINVAL; + } + ethtool_convert_legacy_u32_to_link_mode(phydev->advertising_eee, data->advertised); - else + } else { linkmode_copy(phydev->advertising_eee, phydev->supported_eee); + } phydev->eee_enabled = true; } else { From edea0c5a994b7829c9ada8f5bc762c4e32f4f797 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Wed, 22 Feb 2023 17:06:00 +0530 Subject: [PATCH 18/53] octeontx2-pf: Recalculate UDP checksum for ptp 1-step sync packet When checksum offload is disabled in the driver via ethtool, the PTP 1-step sync packets contain incorrect checksum, since the stack calculates the checksum before driver updates PTP timestamp field in the packet. This results in PTP packets getting dropped at the other end. This patch fixes the issue by re-calculating the UDP checksum after updating PTP timestamp field in the driver. Fixes: 2958d17a8984 ("octeontx2-pf: Add support for ptp 1-step mode on CN10K silicon") Signed-off-by: Geetha sowjanya Signed-off-by: Hariprasad Kelam Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: Sai Krishna Link: https://lore.kernel.org/r/20230222113600.1965116-1-saikrishnag@marvell.com Signed-off-by: Paolo Abeni --- .../marvell/octeontx2/nic/otx2_txrx.c | 76 ++++++++++++++----- 1 file changed, 57 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index ef10aef3cda0..7045fedfd73a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "otx2_reg.h" #include "otx2_common.h" @@ -699,7 +700,7 @@ static void otx2_sqe_add_ext(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, static void otx2_sqe_add_mem(struct otx2_snd_queue *sq, int *offset, int alg, u64 iova, int ptp_offset, - u64 base_ns, int udp_csum) + u64 base_ns, bool udp_csum_crt) { struct nix_sqe_mem_s *mem; @@ -711,7 +712,7 @@ static void otx2_sqe_add_mem(struct otx2_snd_queue *sq, int *offset, if (ptp_offset) { mem->start_offset = ptp_offset; - mem->udp_csum_crt = udp_csum; + mem->udp_csum_crt = !!udp_csum_crt; mem->base_ns = base_ns; mem->step_type = 1; } @@ -986,10 +987,11 @@ static bool otx2_validate_network_transport(struct sk_buff *skb) return false; } -static bool otx2_ptp_is_sync(struct sk_buff *skb, int *offset, int *udp_csum) +static bool otx2_ptp_is_sync(struct sk_buff *skb, int *offset, bool *udp_csum_crt) { struct ethhdr *eth = (struct ethhdr *)(skb->data); u16 nix_offload_hlen = 0, inner_vhlen = 0; + bool udp_hdr_present = false, is_sync; u8 *data = skb->data, *msgtype; __be16 proto = eth->h_proto; int network_depth = 0; @@ -1029,45 +1031,81 @@ static bool otx2_ptp_is_sync(struct sk_buff *skb, int *offset, int *udp_csum) if (!otx2_validate_network_transport(skb)) return false; - *udp_csum = 1; *offset = nix_offload_hlen + skb_transport_offset(skb) + sizeof(struct udphdr); + udp_hdr_present = true; + } msgtype = data + *offset; - /* Check PTP messageId is SYNC or not */ - return (*msgtype & 0xf) == 0; + is_sync = !(*msgtype & 0xf); + if (is_sync) + *udp_csum_crt = udp_hdr_present; + else + *offset = 0; + + return is_sync; } static void otx2_set_txtstamp(struct otx2_nic *pfvf, struct sk_buff *skb, struct otx2_snd_queue *sq, int *offset) { + struct ethhdr *eth = (struct ethhdr *)(skb->data); struct ptpv2_tstamp *origin_tstamp; - int ptp_offset = 0, udp_csum = 0; + bool udp_csum_crt = false; + unsigned int udphoff; struct timespec64 ts; + int ptp_offset = 0; + __wsum skb_csum; u64 iova; if (unlikely(!skb_shinfo(skb)->gso_size && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) { - if (unlikely(pfvf->flags & OTX2_FLAG_PTP_ONESTEP_SYNC)) { - if (otx2_ptp_is_sync(skb, &ptp_offset, &udp_csum)) { - origin_tstamp = (struct ptpv2_tstamp *) - ((u8 *)skb->data + ptp_offset + - PTP_SYNC_SEC_OFFSET); - ts = ns_to_timespec64(pfvf->ptp->tstamp); - origin_tstamp->seconds_msb = htons((ts.tv_sec >> 32) & 0xffff); - origin_tstamp->seconds_lsb = htonl(ts.tv_sec & 0xffffffff); - origin_tstamp->nanoseconds = htonl(ts.tv_nsec); - /* Point to correction field in PTP packet */ - ptp_offset += 8; + if (unlikely(pfvf->flags & OTX2_FLAG_PTP_ONESTEP_SYNC && + otx2_ptp_is_sync(skb, &ptp_offset, &udp_csum_crt))) { + origin_tstamp = (struct ptpv2_tstamp *) + ((u8 *)skb->data + ptp_offset + + PTP_SYNC_SEC_OFFSET); + ts = ns_to_timespec64(pfvf->ptp->tstamp); + origin_tstamp->seconds_msb = htons((ts.tv_sec >> 32) & 0xffff); + origin_tstamp->seconds_lsb = htonl(ts.tv_sec & 0xffffffff); + origin_tstamp->nanoseconds = htonl(ts.tv_nsec); + /* Point to correction field in PTP packet */ + ptp_offset += 8; + + /* When user disables hw checksum, stack calculates the csum, + * but it does not cover ptp timestamp which is added later. + * Recalculate the checksum manually considering the timestamp. + */ + if (udp_csum_crt) { + struct udphdr *uh = udp_hdr(skb); + + if (skb->ip_summed != CHECKSUM_PARTIAL && uh->check != 0) { + udphoff = skb_transport_offset(skb); + uh->check = 0; + skb_csum = skb_checksum(skb, udphoff, skb->len - udphoff, + 0); + if (ntohs(eth->h_proto) == ETH_P_IPV6) + uh->check = csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb_csum); + else + uh->check = csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + IPPROTO_UDP, + skb_csum); + } } } else { skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; } iova = sq->timestamps->iova + (sq->head * sizeof(u64)); otx2_sqe_add_mem(sq, offset, NIX_SENDMEMALG_E_SETTSTMP, iova, - ptp_offset, pfvf->ptp->base_ns, udp_csum); + ptp_offset, pfvf->ptp->base_ns, udp_csum_crt); } else { skb_tx_timestamp(skb); } From ee0a735fd97ccde766ab557d1fc722c92cebacda Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Wed, 22 Feb 2023 15:42:41 -0500 Subject: [PATCH 19/53] net: sunhme: Fix region request devm_request_region is for I/O regions. Use devm_request_mem_region instead. This fixes the driver failing to probe since 99df45c9e0a4 ("sunhme: fix an IS_ERR() vs NULL check in probe"), which checked the result. Fixes: 914d9b2711dd ("sunhme: switch to devres") Signed-off-by: Sean Anderson Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20230222204242.2658247-1-seanga2@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sun/sunhme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index 1c16548415cd..b0c7ab74a82e 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -2894,8 +2894,10 @@ static int happy_meal_pci_probe(struct pci_dev *pdev, goto err_out_clear_quattro; } - hpreg_res = devm_request_region(&pdev->dev, pci_resource_start(pdev, 0), - pci_resource_len(pdev, 0), DRV_NAME); + hpreg_res = devm_request_mem_region(&pdev->dev, + pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0), + DRV_NAME); if (!hpreg_res) { err = -EBUSY; dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting.\n"); From 68ba44639537de6f91fe32783766322d41848127 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 22 Feb 2023 12:07:21 -0500 Subject: [PATCH 20/53] sctp: add a refcnt in sctp_stream_priorities to avoid a nested loop With this refcnt added in sctp_stream_priorities, we don't need to traverse all streams to check if the prio is used by other streams when freeing one stream's prio in sctp_sched_prio_free_sid(). This can avoid a nested loop (up to 65535 * 65535), which may cause a stuck as Ying reported: watchdog: BUG: soft lockup - CPU#23 stuck for 26s! [ksoftirqd/23:136] Call Trace: sctp_sched_prio_free_sid+0xab/0x100 [sctp] sctp_stream_free_ext+0x64/0xa0 [sctp] sctp_stream_free+0x31/0x50 [sctp] sctp_association_free+0xa5/0x200 [sctp] Note that it doesn't need to use refcount_t type for this counter, as its accessing is always protected under the sock lock. v1->v2: - add a check in sctp_sched_prio_set to avoid the possible prio_head refcnt overflow. Fixes: 9ed7bfc79542 ("sctp: fix memory leak in sctp_stream_outq_migrate()") Reported-by: Ying Xu Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Link: https://lore.kernel.org/r/825eb0c905cb864991eba335f4a2b780e543f06b.1677085641.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sctp/structs.h | 1 + net/sctp/stream_sched_prio.c | 52 +++++++++++++++--------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index afa3781e3ca2..e1f6e7fc2b11 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1412,6 +1412,7 @@ struct sctp_stream_priorities { /* The next stream in line */ struct sctp_stream_out_ext *next; __u16 prio; + __u16 users; }; struct sctp_stream_out_ext { diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 42d4800f263d..4d4d9da331f4 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -25,6 +25,18 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); +static struct sctp_stream_priorities *sctp_sched_prio_head_get(struct sctp_stream_priorities *p) +{ + p->users++; + return p; +} + +static void sctp_sched_prio_head_put(struct sctp_stream_priorities *p) +{ + if (p && --p->users == 0) + kfree(p); +} + static struct sctp_stream_priorities *sctp_sched_prio_new_head( struct sctp_stream *stream, int prio, gfp_t gfp) { @@ -38,6 +50,7 @@ static struct sctp_stream_priorities *sctp_sched_prio_new_head( INIT_LIST_HEAD(&p->active); p->next = NULL; p->prio = prio; + p->users = 1; return p; } @@ -53,7 +66,7 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head( */ list_for_each_entry(p, &stream->prio_list, prio_sched) { if (p->prio == prio) - return p; + return sctp_sched_prio_head_get(p); if (p->prio > prio) break; } @@ -70,7 +83,7 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head( */ break; if (p->prio == prio) - return p; + return sctp_sched_prio_head_get(p); } /* If not even there, allocate a new one. */ @@ -154,32 +167,21 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, struct sctp_stream_out_ext *soute = sout->ext; struct sctp_stream_priorities *prio_head, *old; bool reschedule = false; - int i; + + old = soute->prio_head; + if (old && old->prio == prio) + return 0; prio_head = sctp_sched_prio_get_head(stream, prio, gfp); if (!prio_head) return -ENOMEM; reschedule = sctp_sched_prio_unsched(soute); - old = soute->prio_head; soute->prio_head = prio_head; if (reschedule) sctp_sched_prio_sched(stream, soute); - if (!old) - /* Happens when we set the priority for the first time */ - return 0; - - for (i = 0; i < stream->outcnt; i++) { - soute = SCTP_SO(stream, i)->ext; - if (soute && soute->prio_head == old) - /* It's still in use, nothing else to do here. */ - return 0; - } - - /* No hits, we are good to free it. */ - kfree(old); - + sctp_sched_prio_head_put(old); return 0; } @@ -206,20 +208,8 @@ static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) { - struct sctp_stream_priorities *prio = SCTP_SO(stream, sid)->ext->prio_head; - int i; - - if (!prio) - return; - + sctp_sched_prio_head_put(SCTP_SO(stream, sid)->ext->prio_head); SCTP_SO(stream, sid)->ext->prio_head = NULL; - for (i = 0; i < stream->outcnt; i++) { - if (SCTP_SO(stream, i)->ext && - SCTP_SO(stream, i)->ext->prio_head == prio) - return; - } - - kfree(prio); } static void sctp_sched_prio_enqueue(struct sctp_outq *q, From 3acd9db9293f3b33ac04e8d44ed05b604ad1ac26 Mon Sep 17 00:00:00 2001 From: Deepak R Varma Date: Wed, 22 Feb 2023 18:58:48 +0530 Subject: [PATCH 21/53] octeontx2-pf: Use correct struct reference in test condition Fix the typo/copy-paste error by replacing struct variable ah_esp_mask name by ah_esp_hdr. Issue identified using doublebitand.cocci Coccinelle semantic patch. Fixes: b7cf966126eb ("octeontx2-pf: Add flow classification using IP next level protocol") Link: https://lore.kernel.org/all/20210111112537.3277-1-naveenm@marvell.com/ Signed-off-by: Deepak R Varma Link: https://lore.kernel.org/r/Y/YYkKddeHOt80cO@ubun2204.myguest.virtualbox.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c index 684cb8ec9f21..10e11262d48a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c @@ -793,7 +793,7 @@ static int otx2_prepare_ipv6_flow(struct ethtool_rx_flow_spec *fsp, /* NPC profile doesn't extract AH/ESP header fields */ if ((ah_esp_mask->spi & ah_esp_hdr->spi) || - (ah_esp_mask->tclass & ah_esp_mask->tclass)) + (ah_esp_mask->tclass & ah_esp_hdr->tclass)) return -EOPNOTSUPP; if (flow_type == AH_V6_FLOW) From a6b811cb402f4beb7c8a56345a40bf6c65bd771d Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 Feb 2023 15:01:59 -0800 Subject: [PATCH 22/53] net/mlx5e: Remove hairpin write debugfs files Per the discussion in [1], hairpin parameters will be exposed using devlink, remove the debugfs files. [1] https://lore.kernel.org/all/20230111194608.7f15b9a1@kernel.org/ Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/all/20230222230202.523667-1-saeed@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 59 ------------------- 1 file changed, 59 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index e34d9b5fb504..70b8d2dfa751 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1048,61 +1048,6 @@ static int mlx5e_hairpin_get_prio(struct mlx5e_priv *priv, return 0; } -static int debugfs_hairpin_queues_set(void *data, u64 val) -{ - struct mlx5e_hairpin_params *hp = data; - - if (!val) { - mlx5_core_err(hp->mdev, - "Number of hairpin queues must be > 0\n"); - return -EINVAL; - } - - hp->num_queues = val; - - return 0; -} - -static int debugfs_hairpin_queues_get(void *data, u64 *val) -{ - struct mlx5e_hairpin_params *hp = data; - - *val = hp->num_queues; - - return 0; -} -DEFINE_DEBUGFS_ATTRIBUTE(fops_hairpin_queues, debugfs_hairpin_queues_get, - debugfs_hairpin_queues_set, "%llu\n"); - -static int debugfs_hairpin_queue_size_set(void *data, u64 val) -{ - struct mlx5e_hairpin_params *hp = data; - - if (val > BIT(MLX5_CAP_GEN(hp->mdev, log_max_hairpin_num_packets))) { - mlx5_core_err(hp->mdev, - "Invalid hairpin queue size, must be <= %lu\n", - BIT(MLX5_CAP_GEN(hp->mdev, - log_max_hairpin_num_packets))); - return -EINVAL; - } - - hp->queue_size = roundup_pow_of_two(val); - - return 0; -} - -static int debugfs_hairpin_queue_size_get(void *data, u64 *val) -{ - struct mlx5e_hairpin_params *hp = data; - - *val = hp->queue_size; - - return 0; -} -DEFINE_DEBUGFS_ATTRIBUTE(fops_hairpin_queue_size, - debugfs_hairpin_queue_size_get, - debugfs_hairpin_queue_size_set, "%llu\n"); - static int debugfs_hairpin_num_active_get(void *data, u64 *val) { struct mlx5e_tc_table *tc = data; @@ -1148,10 +1093,6 @@ static void mlx5e_tc_debugfs_init(struct mlx5e_tc_table *tc, tc->dfs_root = debugfs_create_dir("tc", dfs_root); - debugfs_create_file("hairpin_num_queues", 0644, tc->dfs_root, - &tc->hairpin_params, &fops_hairpin_queues); - debugfs_create_file("hairpin_queue_size", 0644, tc->dfs_root, - &tc->hairpin_params, &fops_hairpin_queue_size); debugfs_create_file("hairpin_num_active", 0444, tc->dfs_root, tc, &fops_hairpin_num_active); debugfs_create_file("hairpin_table_dump", 0444, tc->dfs_root, tc, From 1862de92c81c2a74ff05819aca20b0b83192c83b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 23 Feb 2023 09:26:56 +0200 Subject: [PATCH 23/53] netdev-genl: fix repeated typo oflloading -> offloading Fix a repeated copy/paste typo. Fixes: d3d854fd6a1d ("netdev-genl: create a simple family for netdev stuff") Signed-off-by: Tariq Toukan Acked-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- Documentation/netlink/specs/netdev.yaml | 2 +- include/uapi/linux/netdev.h | 2 +- tools/include/uapi/linux/netdev.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index b4dcdae54ffd..cffef09729f1 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -28,7 +28,7 @@ definitions: - name: hw-offload doc: - This feature informs if netdev supports XDP hw oflloading. + This feature informs if netdev supports XDP hw offloading. - name: rx-sg doc: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 9ee459872600..588391447bfb 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -19,7 +19,7 @@ * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP * in zero copy mode. * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw - * oflloading. + * offloading. * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear * XDP buffer support in the driver napi callback. * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 9ee459872600..588391447bfb 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -19,7 +19,7 @@ * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP * in zero copy mode. * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw - * oflloading. + * offloading. * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear * XDP buffer support in the driver napi callback. * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements From ac3ad19584b26fae9ac86e4faebe790becc74491 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Feb 2023 08:38:45 +0000 Subject: [PATCH 24/53] net: fix __dev_kfree_skb_any() vs drop monitor dev_kfree_skb() is aliased to consume_skb(). When a driver is dropping a packet by calling dev_kfree_skb_any() we should propagate the drop reason instead of pretending the packet was consumed. Note: Now we have enum skb_drop_reason we could remove enum skb_free_reason (for linux-6.4) v2: added an unlikely(), suggested by Yunsheng Lin. Fixes: e6247027e517 ("net: introduce dev_consume_skb_any()") Signed-off-by: Eric Dumazet Cc: Yunsheng Lin Reviewed-by: Yunsheng Lin Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 18dc8d75ead9..253584777101 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3134,8 +3134,10 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { if (in_hardirq() || irqs_disabled()) __dev_kfree_skb_irq(skb, reason); + else if (unlikely(reason == SKB_REASON_DROPPED)) + kfree_skb(skb); else - dev_kfree_skb(skb); + consume_skb(skb); } EXPORT_SYMBOL(__dev_kfree_skb_any); From 0483a16bc493075bb9f6483497786fed41d27b94 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 22 Feb 2023 10:29:44 +0800 Subject: [PATCH 25/53] net/mlx5: Remove NULL check before dev_{put, hold} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The call netdev_{put, hold} of dev_{put, hold} will check NULL, so there is no need to check before using dev_{put, hold}, remove it to silence the warning: ./drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c:714:2-9: WARNING: NULL check before dev_{put, hold} functions is not needed. Reported-by: Abaci Robot Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=4174 Signed-off-by: Yang Li Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index e24b46953542..8f7452dc00ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -710,8 +710,7 @@ forward: else napi_gro_receive(rq->cq.napi, skb); - if (tc_priv.fwd_dev) - dev_put(tc_priv.fwd_dev); + dev_put(tc_priv.fwd_dev); return; From bfeda9683dcd697dc84bbd5ce375f659e9590bf5 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 17 Feb 2023 11:13:01 +0800 Subject: [PATCH 26/53] net/mlx5e: TC, fix return value check in mlx5e_tc_act_stats_create() kvzalloc() returns NULL pointer not PTR_ERR() when it fails, so replace the IS_ERR() check with NULL pointer check. Fixes: d13674b1d14c ("net/mlx5e: TC, map tc action cookie to a hw counter") Signed-off-by: Yang Yingliang Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c index f71766dca660..626cb7470fa5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c @@ -37,7 +37,7 @@ mlx5e_tc_act_stats_create(void) int err; handle = kvzalloc(sizeof(*handle), GFP_KERNEL); - if (IS_ERR(handle)) + if (!handle) return ERR_PTR(-ENOMEM); err = rhashtable_init(&handle->ht, &act_counters_ht_params); From e435941b1da1a0be4ff8a7ae425774c76a5ac514 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 2 Feb 2023 09:13:54 -0800 Subject: [PATCH 27/53] mlx5: fix skb leak while fifo resync and push During ptp resync operation SKBs were poped from the fifo but were never freed neither by napi_consume nor by dev_kfree_skb_any. Add call to napi_consume_skb to properly free SKBs. Another leak was happening because mlx5e_skb_fifo_has_room() had an error in the check. Comparing free running counters works well unless C promotes the types to something wider than the counter. In this case counters are u16 but the result of the substraction is promouted to int and it causes wrong result (negative value) of the check when producer have already overlapped but consumer haven't yet. Explicit cast to u16 fixes the issue. Fixes: 58a518948f60 ("net/mlx5e: Add resiliency for PTP TX port timestamp") Reviewed-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Vadim Fedorenko Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 9a1bc93b7dc6..392db8d5e9c7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -86,7 +86,8 @@ static bool mlx5e_ptp_ts_cqe_drop(struct mlx5e_ptpsq *ptpsq, u16 skb_cc, u16 skb return (ptpsq->ts_cqe_ctr_mask && (skb_cc != skb_id)); } -static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_cc, u16 skb_id) +static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_cc, + u16 skb_id, int budget) { struct skb_shared_hwtstamps hwts = {}; struct sk_buff *skb; @@ -98,6 +99,7 @@ static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_ hwts.hwtstamp = mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp; skb_tstamp_tx(skb, &hwts); ptpsq->cq_stats->resync_cqe++; + napi_consume_skb(skb, budget); skb_cc = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc); } } @@ -119,7 +121,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, } if (mlx5e_ptp_ts_cqe_drop(ptpsq, skb_cc, skb_id)) - mlx5e_ptp_skb_fifo_ts_cqe_resync(ptpsq, skb_cc, skb_id); + mlx5e_ptp_skb_fifo_ts_cqe_resync(ptpsq, skb_cc, skb_id, budget); skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo); hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index c067d2efab51..2c10adbf1849 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -86,7 +86,7 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq); static inline bool mlx5e_skb_fifo_has_room(struct mlx5e_skb_fifo *fifo) { - return (*fifo->pc - *fifo->cc) < fifo->mask; + return (u16)(*fifo->pc - *fifo->cc) < fifo->mask; } static inline bool From 3a50cf1e8e5157b82268eee7e330dbe5736a0948 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 2 Feb 2023 09:13:55 -0800 Subject: [PATCH 28/53] mlx5: fix possible ptp queue fifo use-after-free Fifo indexes are not checked during pop operations and it leads to potential use-after-free when poping from empty queue. Such case was possible during re-sync action. WARN_ON_ONCE covers future cases. There were out-of-order cqe spotted which lead to drain of the queue and use-after-free because of lack of fifo pointers check. Special check and counter are added to avoid resync operation if SKB could not exist in the fifo because of OOO cqe (skb_id must be between consumer and producer index). Fixes: 58a518948f60 ("net/mlx5e: Add resiliency for PTP TX port timestamp") Signed-off-by: Vadim Fedorenko Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/ptp.c | 19 ++++++++++++++++++- .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 2 ++ .../ethernet/mellanox/mlx5/core/en_stats.c | 1 + .../ethernet/mellanox/mlx5/core/en_stats.h | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 392db8d5e9c7..eb5aeba3addf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -86,6 +86,17 @@ static bool mlx5e_ptp_ts_cqe_drop(struct mlx5e_ptpsq *ptpsq, u16 skb_cc, u16 skb return (ptpsq->ts_cqe_ctr_mask && (skb_cc != skb_id)); } +static bool mlx5e_ptp_ts_cqe_ooo(struct mlx5e_ptpsq *ptpsq, u16 skb_id) +{ + u16 skb_cc = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc); + u16 skb_pc = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_pc); + + if (PTP_WQE_CTR2IDX(skb_id - skb_cc) >= PTP_WQE_CTR2IDX(skb_pc - skb_cc)) + return true; + + return false; +} + static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_cc, u16 skb_id, int budget) { @@ -120,8 +131,14 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, goto out; } - if (mlx5e_ptp_ts_cqe_drop(ptpsq, skb_cc, skb_id)) + if (mlx5e_ptp_ts_cqe_drop(ptpsq, skb_cc, skb_id)) { + if (mlx5e_ptp_ts_cqe_ooo(ptpsq, skb_id)) { + /* already handled by a previous resync */ + ptpsq->cq_stats->ooo_cqe_drop++; + return; + } mlx5e_ptp_skb_fifo_ts_cqe_resync(ptpsq, skb_cc, skb_id, budget); + } skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo); hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 2c10adbf1849..b9c2f67d3794 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -302,6 +302,8 @@ void mlx5e_skb_fifo_push(struct mlx5e_skb_fifo *fifo, struct sk_buff *skb) static inline struct sk_buff *mlx5e_skb_fifo_pop(struct mlx5e_skb_fifo *fifo) { + WARN_ON_ONCE(*fifo->pc == *fifo->cc); + return *mlx5e_skb_fifo_get(fifo, (*fifo->cc)++); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 6687b8136e44..4478223c1720 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -2138,6 +2138,7 @@ static const struct counter_desc ptp_cq_stats_desc[] = { { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, abort_abs_diff_ns) }, { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, resync_cqe) }, { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, resync_event) }, + { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, ooo_cqe_drop) }, }; static const struct counter_desc ptp_rq_stats_desc[] = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 375752d6546d..b77100b60b50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -461,6 +461,7 @@ struct mlx5e_ptp_cq_stats { u64 abort_abs_diff_ns; u64 resync_cqe; u64 resync_event; + u64 ooo_cqe_drop; }; struct mlx5e_rep_stats { From e1ed30c8c09abc85a01c897845bdbd08c0333353 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 15 Feb 2023 20:12:05 +0200 Subject: [PATCH 29/53] net/mlx5: ECPF, wait for VF pages only after disabling host PFs Currently, during the early stages of their unloading, particularly during SRIOV disablement, PFs/ECPFs wait on the release of all of their VFs memory pages. Furthermore, ECPFs are considered the page supplier for host VFs, hence the host VFs memory pages are freed only during ECPF cleanup when host interfaces get disabled. Thus, disabling SRIOV early in unload timeline causes the DPU ECPF to stall on driver unload while waiting on the release of host VF pages that won't be freed before host interfaces get disabled later on. Therefore, for ECPFs, wait on the release of VFs pages only after the disablement of host PFs during ECPF cleanup flow. Then, host PFs and VFs are disabled and their memory shall be freed accordingly. Fixes: 143a41d7623d ("net/mlx5: Disable SRIOV before PF removal") Signed-off-by: Maher Sanalla Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ecpf.c | 4 ++++ drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 9a3878f9e582..7c9c4e40c019 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -98,4 +98,8 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]); if (err) mlx5_core_warn(dev, "Timeout reclaiming external host PF pages err(%d)\n", err); + + err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_VF]); + if (err) + mlx5_core_warn(dev, "Timeout reclaiming external host VFs pages err(%d)\n", err); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 3008e9ce2bbf..20d7662c10fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -147,6 +147,10 @@ mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf) mlx5_eswitch_disable_sriov(dev->priv.eswitch, clear_vf); + /* For ECPFs, skip waiting for host VF pages until ECPF is destroyed */ + if (mlx5_core_is_ecpf(dev)) + return; + if (mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_VF])) mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); } From 1bf8b0dae8dde6f02520a5ea34fdaa3b39342e69 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 31 Jan 2023 12:04:30 +0200 Subject: [PATCH 30/53] net/mlx5e: Verify flow_source cap before using it When adding send to vport rule verify flow_source matching is supported by checking the flow_source cap. Fixes: d04442540372 ("net/mlx5: E-Switch, set flow source for send to uplink rule") Signed-off-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 2a98375a0abf..d766a64b1823 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -869,7 +869,8 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - if (rep->vport == MLX5_VPORT_UPLINK) + if (MLX5_CAP_ESW_FLOWTABLE(on_esw->dev, flow_source) && + rep->vport == MLX5_VPORT_UPLINK) spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT; flow_rule = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(on_esw), From d28a06d7dbedc598a06bd1e53a28125f87ca5d0c Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Wed, 8 Feb 2023 17:44:06 +0200 Subject: [PATCH 31/53] net/mlx5: Geneve, Fix handling of Geneve object id as error code On success, mlx5_geneve_tlv_option_create returns non negative Geneve object id. In case the object id is positive value the caller functions will handle it as an error (non zero) and will fail to offload the Geneve rule. Fix this by changing caller function ,mlx5_geneve_tlv_option_add, to return 0 in case valid non negative object id was provided. Fixes: 0ccc171ea6a2 ("net/mlx5: Geneve, Manage Geneve TLV options") Signed-off-by: Maor Dickman Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c index 23361a9ae4fa..6dc83e871cd7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c @@ -105,6 +105,7 @@ int mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *op geneve->opt_type = opt->type; geneve->obj_id = res; geneve->refcount++; + res = 0; } unlock: From f7cf644796fcdb6a0e30e4a7f218be74ac8cb31d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Feb 2023 10:31:39 -0800 Subject: [PATCH 32/53] tools: ynl-gen: fix single attribute structs with attr 0 only Chuck run into an issue with a single-element attr-set which only has an attr with value of 0. The search for max attr in a struct records attrs with value larger than 0 only (max_val is set to 0 at the start). Adjust the comparison, alternatively max_val could be init'ed to -1. Somehow picking the last attr of a value seems like a good idea in general. Reported-by: Chuck Lever III Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 3942f24b9163..274e9c566f61 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -546,7 +546,7 @@ class Struct: max_val = 0 self.attr_max_val = None for name, attr in self.attr_list: - if attr.value > max_val: + if attr.value >= max_val: max_val = attr.value self.attr_max_val = attr self.attrs[name] = attr From b9d3a3e4ae0cb7c443d46ffe413e17749baab3ba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Feb 2023 10:31:40 -0800 Subject: [PATCH 33/53] tools: ynl-gen: re-raise the exception instead of printing traceback.print_exception() seems tricky to call, we're missing some argument, so re-raise instead. Reported-by: Chuck Lever III Fixes: 3aacf8281336 ("tools: ynl: add an object hierarchy to represent parsed spec") Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/nlspec.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index e204679ad8b7..71da568e2c28 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -3,7 +3,6 @@ import collections import importlib import os -import traceback import yaml @@ -234,8 +233,7 @@ class SpecFamily(SpecElement): resolved.append(elem) if len(resolved) == 0: - traceback.print_exception(last_exception) - raise Exception("Could not resolve any spec element, infinite loop?") + raise last_exception def new_attr_set(self, elem): return SpecAttrSet(self, elem) From d77e7eceeac9f3ab40b45649531993b456eeacd0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Feb 2023 10:31:41 -0800 Subject: [PATCH 34/53] tools: net: add __pycache__ to gitignore Python will generate its customary cache when running ynl scripts: ?? tools/net/ynl/lib/__pycache__/ Reported-by: Chuck Lever III Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/net/ynl/lib/.gitignore diff --git a/tools/net/ynl/lib/.gitignore b/tools/net/ynl/lib/.gitignore new file mode 100644 index 000000000000..c18dd8d83cee --- /dev/null +++ b/tools/net/ynl/lib/.gitignore @@ -0,0 +1 @@ +__pycache__/ From bce90459937376b6c7553cfbe957d73293ab7bc8 Mon Sep 17 00:00:00 2001 From: nick black Date: Thu, 23 Feb 2023 21:29:16 -0500 Subject: [PATCH 35/53] docs: net: fix inaccuracies in msg_zerocopy.rst Replace "sendpage" with "sendfile". Remove comment about ENOBUFS when the sockopt hasn't been set; experimentation indicates that this is not true. Signed-off-by: nick black Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/Y/gg/EhIIjugLdd3@schwarzgerat.orthanc Signed-off-by: Jakub Kicinski --- Documentation/networking/msg_zerocopy.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst index 15920db8d35d..b3ea96af9b49 100644 --- a/Documentation/networking/msg_zerocopy.rst +++ b/Documentation/networking/msg_zerocopy.rst @@ -15,7 +15,7 @@ Opportunity and Caveats Copying large buffers between user process and kernel can be expensive. Linux supports various interfaces that eschew copying, -such as sendpage and splice. The MSG_ZEROCOPY flag extends the +such as sendfile and splice. The MSG_ZEROCOPY flag extends the underlying copy avoidance mechanism to common socket send calls. Copy avoidance is not a free lunch. As implemented, with page pinning, @@ -83,8 +83,8 @@ Pass the new flag. ret = send(fd, buf, sizeof(buf), MSG_ZEROCOPY); A zerocopy failure will return -1 with errno ENOBUFS. This happens if -the socket option was not set, the socket exceeds its optmem limit or -the user exceeds its ulimit on locked pages. +the socket exceeds its optmem limit or the user exceeds their ulimit on +locked pages. Mixing copy avoidance and copying From 6f2ce45f0c5f12b5fc6ffabf367bc1bb145534cc Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 23 Feb 2023 09:39:44 -0600 Subject: [PATCH 36/53] ibmvnic: Assign XPS map to correct queue index When setting the XPS map value for TX queues, use the index of the transmit queue. Previously, the function was passing the index of the loop that iterates over all queues (RX and TX). This was causing invalid XPS map values. Fixes: 6831582937bd ("ibmvnic: Toggle between queue types in affinity mapping") Signed-off-by: Nick Child Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20230223153944.44969-1-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 146ca1d8031b..c63d3ec9d328 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -296,10 +296,10 @@ static void ibmvnic_set_affinity(struct ibmvnic_adapter *adapter) rc = __netif_set_xps_queue(adapter->netdev, cpumask_bits(queue->affinity_mask), - i, XPS_CPUS); + i_txqs - 1, XPS_CPUS); if (rc) netdev_warn(adapter->netdev, "%s: Set XPS on queue %d failed, rc = %d.\n", - __func__, i, rc); + __func__, i_txqs - 1, rc); } out: From 37e1f3acc339b28493eb3dad571c3f01b6af86f6 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 24 Feb 2023 11:18:49 -0700 Subject: [PATCH 37/53] net/sched: cls_api: Move call to tcf_exts_miss_cookie_base_destroy() When CONFIG_NET_CLS_ACT is disabled: ../net/sched/cls_api.c:141:13: warning: 'tcf_exts_miss_cookie_base_destroy' defined but not used [-Wunused-function] 141 | static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Due to the way the code is structured, it is possible for a definition of tcf_exts_miss_cookie_base_destroy() to be present without actually being used. Its single callsite is in an '#ifdef CONFIG_NET_CLS_ACT' block but a definition will always be present in the file. The version of tcf_exts_miss_cookie_base_destroy() that actually does something depends on CONFIG_NET_TC_SKB_EXT, so the stub function is used in both CONFIG_NET_CLS_ACT=n and CONFIG_NET_CLS_ACT=y + CONFIG_NET_TC_SKB_EXT=n configurations. Move the call to tcf_exts_miss_cookie_base_destroy() in tcf_exts_destroy() out of the '#ifdef CONFIG_NET_CLS_ACT', so that it always appears used to the compiler, while not changing any behavior with any of the various configuration combinations. Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Signed-off-by: Nathan Chancellor Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3569e2c3660c..2a6b6be0811b 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3241,9 +3241,9 @@ EXPORT_SYMBOL(tcf_exts_init_ex); void tcf_exts_destroy(struct tcf_exts *exts) { -#ifdef CONFIG_NET_CLS_ACT tcf_exts_miss_cookie_base_destroy(exts); +#ifdef CONFIG_NET_CLS_ACT if (exts->actions) { tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); From cf871006c01709211f2620a33de37257362a05e8 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 24 Feb 2023 13:32:41 -0800 Subject: [PATCH 38/53] ice: remove unnecessary CONFIG_ICE_GNSS CONFIG_ICE_GNSS was added by commit c7ef8221ca7d ("ice: use GNSS subsystem instead of TTY") as a way to allow the ice driver to optionally support GNSS features without forcing a dependency on CONFIG_GNSS. The original implementation of that commit at [1] used IS_REACHABLE. This was rejected by Olek at [2] with the suggested implementation of CONFIG_ICE_GNSS. Eventually after merging, Linus reported a .config which had CONFIG_ICE_GNSS = y when both GNSS = n and ICE = n. This confused him and he felt that the config option was not useful, and commented about it at [3]. CONFIG_ICE_GNSS is defined to y whenever GNSS = ICE. This results in it being set in cases where both options are not enabled. The goal of CONFIG_ICE_GNSS is to ensure that the GNSS support in the ice driver is enabled when GNSS is enabled. The complaint from Olek about the original IS_REACHABLE was due to the required IS_REACHABLE checks throughout the ice driver code and the fact that ice_gnss.c was compiled regardless of GNSS support. This can be fixed in the Makefile by using ice-$(CONFIG_GNSS) += ice_gnss.o In this case, if GNSS = m and ICE = y, we can result in some confusing behavior where GNSS support is not enabled because its not built in. See [4]. To disallow this, have CONFIG_ICE depend on GNSS || GNSS = n. This ensures that we cannot enable CONFIG_ICE as builtin while GNSS is a module. Drop CONFIG_ICE_GNSS, and replace the IS_ENABLED checks for it with checks for GNSS. Update the Makefile to add the ice_gnss.o object based on CONFIG_GNSS. This works to ensure that GNSS support can optionally be enabled, doesn't have an unnnecessary extra config option, and has Kbuild enforce the dependency such that you can't accidentally enable GNSS as a module and ICE as a builtin. [1] https://lore.kernel.org/intel-wired-lan/20221019095603.44825-1-arkadiusz.kubalewski@intel.com/ [2] https://lore.kernel.org/intel-wired-lan/20221028165706.96849-1-alexandr.lobakin@intel.com/ [3] https://lore.kernel.org/all/CAHk-=wi_410KZqHwF-WL5U7QYxnpHHHNP-3xL=g_y89XnKc-uw@mail.gmail.com/ [4] https://lore.kernel.org/netdev/20230223161309.0e439c5f@kernel.org/ Reported-by: Linus Torvalds Signed-off-by: Jacob Keller Fixes: c7ef8221ca7d ("ice: use GNSS subsystem instead of TTY") Cc: Arkadiusz Kubalewski Cc: Alexander Lobakin Cc: Jakub Kicinski Cc: Anthony Nguyen Acked-by: Jakub Kicinski Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/Kconfig | 4 +--- drivers/net/ethernet/intel/ice/Makefile | 2 +- drivers/net/ethernet/intel/ice/ice_gnss.h | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index a3c84bf05e44..c18c3b373846 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -296,6 +296,7 @@ config ICE default n depends on PCI_MSI depends on PTP_1588_CLOCK_OPTIONAL + depends on GNSS || GNSS = n select AUXILIARY_BUS select DIMLIB select NET_DEVLINK @@ -337,9 +338,6 @@ config ICE_HWTS the PTP clock driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE). -config ICE_GNSS - def_bool GNSS = y || GNSS = ICE - config FM10K tristate "Intel(R) FM10000 Ethernet Switch Host Interface Support" default n diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index f269952d207d..5d89392f969b 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -47,4 +47,4 @@ ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o ice-$(CONFIG_XDP_SOCKETS) += ice_xsk.o ice-$(CONFIG_ICE_SWITCHDEV) += ice_eswitch.o -ice-$(CONFIG_ICE_GNSS) += ice_gnss.o +ice-$(CONFIG_GNSS) += ice_gnss.o diff --git a/drivers/net/ethernet/intel/ice/ice_gnss.h b/drivers/net/ethernet/intel/ice/ice_gnss.h index 31db0701d13f..4d49e5b0b4b8 100644 --- a/drivers/net/ethernet/intel/ice/ice_gnss.h +++ b/drivers/net/ethernet/intel/ice/ice_gnss.h @@ -45,7 +45,7 @@ struct gnss_serial { struct list_head queue; }; -#if IS_ENABLED(CONFIG_ICE_GNSS) +#if IS_ENABLED(CONFIG_GNSS) void ice_gnss_init(struct ice_pf *pf); void ice_gnss_exit(struct ice_pf *pf); bool ice_gnss_is_gps_present(struct ice_hw *hw); @@ -56,5 +56,5 @@ static inline bool ice_gnss_is_gps_present(struct ice_hw *hw) { return false; } -#endif /* IS_ENABLED(CONFIG_ICE_GNSS) */ +#endif /* IS_ENABLED(CONFIG_GNSS) */ #endif /* _ICE_GNSS_H_ */ From 25ff6f8a5a3b8dc48e8abda6f013e8cc4b14ffea Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 25 Feb 2023 13:56:14 +0300 Subject: [PATCH 39/53] nfc: fix memory leak of se_io context in nfc_genl_se_io The callback context for sending/receiving APDUs to/from the selected secure element is allocated inside nfc_genl_se_io and supposed to be eventually freed in se_io_cb callback function. However, there are several error paths where the bwi_timer is not charged to call se_io_cb later, and the cb_context is leaked. The patch proposes to free the cb_context explicitly on those error paths. At the moment we can't simply check 'dev->ops->se_io()' return value as it may be negative in both cases: when the timer was charged and was not. Fixes: 5ce3f32b5264 ("NFC: netlink: SE API implementation") Reported-by: syzbot+df64c0a2e8d68e78a4fa@syzkaller.appspotmail.com Signed-off-by: Fedor Pchelkin Signed-off-by: Alexey Khoroshilov Signed-off-by: David S. Miller --- drivers/nfc/st-nci/se.c | 6 ++++++ drivers/nfc/st21nfca/se.c | 6 ++++++ net/nfc/netlink.c | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/drivers/nfc/st-nci/se.c b/drivers/nfc/st-nci/se.c index ec87dd21e054..b2f1ced8e6dd 100644 --- a/drivers/nfc/st-nci/se.c +++ b/drivers/nfc/st-nci/se.c @@ -672,6 +672,12 @@ int st_nci_se_io(struct nci_dev *ndev, u32 se_idx, ST_NCI_EVT_TRANSMIT_DATA, apdu, apdu_length); default: + /* Need to free cb_context here as at the moment we can't + * clearly indicate to the caller if the callback function + * would be called (and free it) or not. In both cases a + * negative value may be returned to the caller. + */ + kfree(cb_context); return -ENODEV; } } diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index df8d27cf2956..dae288bebcb5 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -236,6 +236,12 @@ int st21nfca_hci_se_io(struct nfc_hci_dev *hdev, u32 se_idx, ST21NFCA_EVT_TRANSMIT_DATA, apdu, apdu_length); default: + /* Need to free cb_context here as at the moment we can't + * clearly indicate to the caller if the callback function + * would be called (and free it) or not. In both cases a + * negative value may be returned to the caller. + */ + kfree(cb_context); return -ENODEV; } } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 1fc339084d89..348bf561bc9f 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1442,7 +1442,11 @@ static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); + device_unlock(&dev->dev); + return rc; + error: + kfree(cb_context); device_unlock(&dev->dev); return rc; } From aaa3c08ee0653beaa649d4adfb27ad562641cfd8 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Fri, 24 Feb 2023 01:41:45 +0100 Subject: [PATCH 40/53] qede: avoid uninitialized entries in coal_entry array Even after commit 908d4bb7c54c ("qede: fix interrupt coalescing configuration"), some entries of the coal_entry array may theoretically be used uninitialized: 1. qede_alloc_fp_array() allocates QEDE_MAX_RSS_CNT entries for coal_entry. The initial allocation uses kcalloc, so everything is initialized. 2. The user sets a small number of queues (ethtool -L). coal_entry is reallocated for the actual small number of queues. 3. The user sets a bigger number of queues. coal_entry is reallocated bigger. The added entries are not necessarily initialized. In practice, the reallocations will actually keep using the originally allocated region of memory, but we should not rely on it. The reallocation is unnecessary. coal_entry can always have QEDE_MAX_RSS_CNT entries. Fixes: 908d4bb7c54c ("qede: fix interrupt coalescing configuration") Signed-off-by: Michal Schmidt Nacked-by: Manish Chopra Acked-by: Manish Chopra Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_main.c | 21 +++++++------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 4a3c3b5fb4a1..261f982ca40d 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -963,7 +963,6 @@ static int qede_alloc_fp_array(struct qede_dev *edev) { u8 fp_combined, fp_rx = edev->fp_num_rx; struct qede_fastpath *fp; - void *mem; int i; edev->fp_array = kcalloc(QEDE_QUEUE_CNT(edev), @@ -974,21 +973,15 @@ static int qede_alloc_fp_array(struct qede_dev *edev) } if (!edev->coal_entry) { - mem = kcalloc(QEDE_MAX_RSS_CNT(edev), - sizeof(*edev->coal_entry), GFP_KERNEL); - } else { - mem = krealloc(edev->coal_entry, - QEDE_QUEUE_CNT(edev) * sizeof(*edev->coal_entry), - GFP_KERNEL); + edev->coal_entry = kcalloc(QEDE_MAX_RSS_CNT(edev), + sizeof(*edev->coal_entry), + GFP_KERNEL); + if (!edev->coal_entry) { + DP_ERR(edev, "coalesce entry allocation failed\n"); + goto err; + } } - if (!mem) { - DP_ERR(edev, "coalesce entry allocation failed\n"); - kfree(edev->coal_entry); - goto err; - } - edev->coal_entry = mem; - fp_combined = QEDE_QUEUE_CNT(edev) - fp_rx - edev->fp_num_tx; /* Allocate the FP elements for Rx queues followed by combined and then From cf45efcb250ea788085f330db972256735f970e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 24 Feb 2023 00:02:17 +0100 Subject: [PATCH 41/53] wifi: mt76: usb: fix use-after-free in mt76u_free_rx_queue Fix the following use-after-free issue in mt76u_free_rx_queue routine: usb 3-3.3.4: reset high-speed USB device number 8 using xhci_hcd iwlwifi 0000:05:00.0: Detected RF HR B3, rfid=0x10a100 iwlwifi 0000:05:00.0: base HW address: 50:eb:71:79:02:57 iwlwifi 0000:05:00.0 wlp5s0: renamed from wlan0 mt76x2u 3-3.3.4:1.0: ASIC revision: 76320044 usb 3-3.3.1: 1:3 : unsupported format bits 0x100000000 mt76x2u 3-3.3.4:1.0: could not get hardware semaphore for ROM PATCH ------------[ cut here ]------------ refcount_t: underflow; use-after-free. WARNING: CPU: 13 PID: 983 at lib/refcount.c:28 refcount_warn_saturate+0xba/0x110 Modules linked in: snd_seq_midi snd_seq_midi_event mt76x2u(+) mt76x2_common mt76x02_usb mt76_usb iwlmvm mt76x02_lib mt76 snd_hda_codec_realtek intel_rapl_msr snd_hda_codec_generic snd_hda_codec_hdmi intel_rapl_common snd_hda_intel mac80211 snd_intel_dspcfg snd_usb_audio(+) snd_intel_sdw_acpi btusb edac_mce_amd snd_hda_codec btrtl btbcm snd_usbmidi_lib snd_hda_core btintel snd_rawmidi btmtk snd_hwdep libarc4 mc iwlwifi kvm_amd snd_seq vfat bluetooth eeepc_wmi asus_ec_sensors snd_seq_device fat kvm cfg80211 asus_wmi snd_pcm irqbypass ledtrig_audio sparse_keymap rapl wmi_bmof platform_profile xpad snd_timer k10temp ff_memless i2c_piix4 rfkill snd joydev soundcore acpi_cpufreq loop zram amdgpu crct10dif_pclmul crc32_pclmul crc32c_intel polyval_clmulni polyval_generic drm_ttm_helper ttm video iommu_v2 ucsi_ccg drm_buddy gpu_sched typec_ucsi ghash_clmulni_intel drm_display_helper igb sha512_ssse3 typec ccp nvme cec sp5100_tco nvme_core dca nvme_common wmi ip6_tables ip_tables fuse BTRFS info (device nvme1n1): enabling ssd optimizations CPU: 13 PID: 983 Comm: (udev-worker) Tainted: G W L ------- --- 6.3.0-0.rc0.20230222git5b7c4cabbb65.3.fc39.x86_64+debug BTRFS info (device nvme1n1): auto enabling async discard Hardware name: System manufacturer System Product Name/ROG STRIX X570-I GAMING, BIOS 4601 02/02/2023 RIP: 0010:refcount_warn_saturate+0xba/0x110 Code: 01 01 e8 69 a6 83 ff 0f 0b e9 52 f4 85 00 80 3d 69 6f ec 01 00 75 85 48 c7 c7 d0 25 b3 a9 c6 05 59 6f ec 01 01 e8 46 a6 83 ff <0f> 0b e9 2f f4 85 00 80 3d 47 6f ec 01 00 0f 85 5e ff ff ff 48 c7 RSP: 0018:ffffb4010456fb78 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000080000000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: ffffffffa9b17e3e RDI: 00000000ffffffff RBP: ffff8d15877336c0 R08: 0000000000000000 R09: ffffb4010456fa00 R10: 0000000000000003 R11: ffff8d246e2fffe8 R12: 0000000000000080 R13: ffff8d15b42fd000 R14: 0000000000000000 R15: ffff8d1587736a58 FS: 00007fc05ae34940(0000) GS:ffff8d2425e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055d801f1d540 CR3: 000000011df60000 CR4: 0000000000350ee0 Call Trace: mt76u_queues_deinit+0x2a0/0x370 [mt76_usb] mt76x2u_probe+0xf3/0x130 [mt76x2u] usb_probe_interface+0xe8/0x300 really_probe+0x1b6/0x410 __driver_probe_device+0x78/0x170 driver_probe_device+0x1f/0x90 __driver_attach+0xd2/0x1c0 ? __pfx___driver_attach+0x10/0x10 bus_for_each_dev+0x8a/0xd0 bus_add_driver+0x141/0x230 driver_register+0x77/0x120 usb_register_driver+0xaf/0x170 ? __pfx_init_module+0x10/0x10 [mt76x2u] do_one_initcall+0x6e/0x350 do_init_module+0x4a/0x220 __do_sys_init_module+0x192/0x1c0 ? lock_is_held_type+0xce/0x120 do_syscall_64+0x5b/0x80 ? lock_is_held_type+0xce/0x120 ? asm_exc_page_fault+0x22/0x30 ? lockdep_hardirqs_on+0x7d/0x100 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fc05b1351be Code: 48 8b 0d 4d 0c 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 1a 0c 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffd947c0988 EFLAGS: 00000246 ORIG_RAX: 00000000000000af RAX: ffffffffffffffda RBX: 000055d801f2b090 RCX: 00007fc05b1351be RDX: 00007fc05b65c07d RSI: 00000000000234be RDI: 000055d802c6b170 RBP: 00007ffd947c0a40 R08: 000055d8019b4690 R09: 0000000000022000 R10: 000000055d8019b4 R11: 0000000000000246 R12: 00007fc05b65c07d R13: 0000000000020000 R14: 000055d801f39770 R15: 000055d801f47780 irq event stamp: 186313 hardirqs last enabled at (186323): [] __up_console_sem+0x5e/0x70 hardirqs last disabled at (186332): [] __up_console_sem+0x43/0x70 softirqs last enabled at (186022): [] __irq_exit_rcu+0xd7/0x160 softirqs last disabled at (186017): [] __irq_exit_rcu+0xd7/0x160 ---[ end trace 0000000000000000 ]--- mt76x2u: probe of 3-3.3.4:1.0 failed with error -110 usbcore: registered new interface driver mt76x2u kauditd_printk_skb: 32 callbacks suppressed Fixes: 2f5c3c77fc9b ("wifi: mt76: switch to page_pool allocator") Tested-by: Mikhail Gavrilov Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/f2398f68011c976510c81e1964975b677e65860e.1677193208.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c index b88959ef38aa..5e5c7bf51174 100644 --- a/drivers/net/wireless/mediatek/mt76/usb.c +++ b/drivers/net/wireless/mediatek/mt76/usb.c @@ -706,6 +706,7 @@ mt76u_free_rx_queue(struct mt76_dev *dev, struct mt76_queue *q) q->entry[i].urb = NULL; } page_pool_destroy(q->page_pool); + q->page_pool = NULL; } static void mt76u_free_rx(struct mt76_dev *dev) From 52fd90638a7269be2a6f6cf1e4dea6724f8e13b6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Feb 2023 13:59:34 +0100 Subject: [PATCH 42/53] wifi: wext: warn about usage only once Warn only once since the ratelimit parameters are still allowing too many messages to happen. This will no longer tell you all the different processes, but still gives a heads-up of sorts. Also modify the message to note that wext stops working for future Wi-Fi 7 hardware, this is already implemented in commit 4ca69027691a ("wifi: wireless: deny wireless extensions on MLO-capable devices") and is maybe of more relevance to users than the fact that we'd like to have wireless extensions deprecated. The issue with Wi-Fi 7 is that you can now have multiple connections to the same AP, so a whole bunch of things now become per link rather than per netdev, which can't really be handled in wireless extensions. Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20230224135933.94104aeda1a0.Ie771c6a66d7d6c3cf67da5f3b0c66cea66fd514c@changeid --- net/wireless/wext-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 13a72b17248e..a125fd1fa134 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -641,8 +641,8 @@ static void wireless_warn_cfg80211_wext(void) { char name[sizeof(current->comm)]; - pr_warn_ratelimited("warning: `%s' uses wireless extensions that are deprecated for modern drivers; use nl80211\n", - get_task_comm(name, current)); + pr_warn_once("warning: `%s' uses wireless extensions which will stop working for Wi-Fi 7 hardware; use nl80211\n", + get_task_comm(name, current)); } #endif From e9e42292ea76a8358b0c02ffd530d78e133a1b73 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 24 Feb 2023 12:00:56 -0300 Subject: [PATCH 43/53] net/sched: act_pedit: fix action bind logic The TC architecture allows filters and actions to be created independently. In filters the user can reference action objects using: tc action add action pedit ... index 1 tc filter add ... action pedit index 1 In the current code for act_pedit this is broken as it checks netlink attributes for create/update before actually checking if we are binding to an existing action. tdc results: 1..69 ok 1 319a - Add pedit action that mangles IP TTL ok 2 7e67 - Replace pedit action with invalid goto chain ok 3 377e - Add pedit action with RAW_OP offset u32 ok 4 a0ca - Add pedit action with RAW_OP offset u32 (INVALID) ok 5 dd8a - Add pedit action with RAW_OP offset u16 u16 ok 6 53db - Add pedit action with RAW_OP offset u16 (INVALID) ok 7 5c7e - Add pedit action with RAW_OP offset u8 add value ok 8 2893 - Add pedit action with RAW_OP offset u8 quad ok 9 3a07 - Add pedit action with RAW_OP offset u8-u16-u8 ok 10 ab0f - Add pedit action with RAW_OP offset u16-u8-u8 ok 11 9d12 - Add pedit action with RAW_OP offset u32 set u16 clear u8 invert ok 12 ebfa - Add pedit action with RAW_OP offset overflow u32 (INVALID) ok 13 f512 - Add pedit action with RAW_OP offset u16 at offmask shift set ok 14 c2cb - Add pedit action with RAW_OP offset u32 retain value ok 15 1762 - Add pedit action with RAW_OP offset u8 clear value ok 16 bcee - Add pedit action with RAW_OP offset u8 retain value ok 17 e89f - Add pedit action with RAW_OP offset u16 retain value ok 18 c282 - Add pedit action with RAW_OP offset u32 clear value ok 19 c422 - Add pedit action with RAW_OP offset u16 invert value ok 20 d3d3 - Add pedit action with RAW_OP offset u32 invert value ok 21 57e5 - Add pedit action with RAW_OP offset u8 preserve value ok 22 99e0 - Add pedit action with RAW_OP offset u16 preserve value ok 23 1892 - Add pedit action with RAW_OP offset u32 preserve value ok 24 4b60 - Add pedit action with RAW_OP negative offset u16/u32 set value ok 25 a5a7 - Add pedit action with LAYERED_OP eth set src ok 26 86d4 - Add pedit action with LAYERED_OP eth set src & dst ok 27 f8a9 - Add pedit action with LAYERED_OP eth set dst ok 28 c715 - Add pedit action with LAYERED_OP eth set src (INVALID) ok 29 8131 - Add pedit action with LAYERED_OP eth set dst (INVALID) ok 30 ba22 - Add pedit action with LAYERED_OP eth type set/clear sequence ok 31 dec4 - Add pedit action with LAYERED_OP eth set type (INVALID) ok 32 ab06 - Add pedit action with LAYERED_OP eth add type ok 33 918d - Add pedit action with LAYERED_OP eth invert src ok 34 a8d4 - Add pedit action with LAYERED_OP eth invert dst ok 35 ee13 - Add pedit action with LAYERED_OP eth invert type ok 36 7588 - Add pedit action with LAYERED_OP ip set src ok 37 0fa7 - Add pedit action with LAYERED_OP ip set dst ok 38 5810 - Add pedit action with LAYERED_OP ip set src & dst ok 39 1092 - Add pedit action with LAYERED_OP ip set ihl & dsfield ok 40 02d8 - Add pedit action with LAYERED_OP ip set ttl & protocol ok 41 3e2d - Add pedit action with LAYERED_OP ip set ttl (INVALID) ok 42 31ae - Add pedit action with LAYERED_OP ip ttl clear/set ok 43 486f - Add pedit action with LAYERED_OP ip set duplicate fields ok 44 e790 - Add pedit action with LAYERED_OP ip set ce, df, mf, firstfrag, nofrag fields ok 45 cc8a - Add pedit action with LAYERED_OP ip set tos ok 46 7a17 - Add pedit action with LAYERED_OP ip set precedence ok 47 c3b6 - Add pedit action with LAYERED_OP ip add tos ok 48 43d3 - Add pedit action with LAYERED_OP ip add precedence ok 49 438e - Add pedit action with LAYERED_OP ip clear tos ok 50 6b1b - Add pedit action with LAYERED_OP ip clear precedence ok 51 824a - Add pedit action with LAYERED_OP ip invert tos ok 52 106f - Add pedit action with LAYERED_OP ip invert precedence ok 53 6829 - Add pedit action with LAYERED_OP beyond ip set dport & sport ok 54 afd8 - Add pedit action with LAYERED_OP beyond ip set icmp_type & icmp_code ok 55 3143 - Add pedit action with LAYERED_OP beyond ip set dport (INVALID) ok 56 815c - Add pedit action with LAYERED_OP ip6 set src ok 57 4dae - Add pedit action with LAYERED_OP ip6 set dst ok 58 fc1f - Add pedit action with LAYERED_OP ip6 set src & dst ok 59 6d34 - Add pedit action with LAYERED_OP ip6 dst retain value (INVALID) ok 60 94bb - Add pedit action with LAYERED_OP ip6 traffic_class ok 61 6f5e - Add pedit action with LAYERED_OP ip6 flow_lbl ok 62 6795 - Add pedit action with LAYERED_OP ip6 set payload_len, nexthdr, hoplimit ok 63 1442 - Add pedit action with LAYERED_OP tcp set dport & sport ok 64 b7ac - Add pedit action with LAYERED_OP tcp sport set (INVALID) ok 65 cfcc - Add pedit action with LAYERED_OP tcp flags set ok 66 3bc4 - Add pedit action with LAYERED_OP tcp set dport, sport & flags fields ok 67 f1c8 - Add pedit action with LAYERED_OP udp set dport & sport ok 68 d784 - Add pedit action with mixed RAW/LAYERED_OP #1 ok 69 70ca - Add pedit action with mixed RAW/LAYERED_OP #2 Fixes: 71d0ed7079df ("net/act_pedit: Support using offset relative to the conventional network headers") Fixes: f67169fef8db ("net/sched: act_pedit: fix WARN() in the traffic path") Reviewed-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 58 +++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 77d288d384ae..4559a1507ea5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -181,26 +181,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } parm = nla_data(pattr); - if (!parm->nkeys) { - NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); - return -EINVAL; - } - ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (nla_len(pattr) < sizeof(*parm) + ksize) { - NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); - return -EINVAL; - } - - nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); - if (!nparms) - return -ENOMEM; - - nparms->tcfp_keys_ex = - tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); - if (IS_ERR(nparms->tcfp_keys_ex)) { - ret = PTR_ERR(nparms->tcfp_keys_ex); - goto out_free; - } index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); @@ -209,25 +189,49 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); - goto out_free_ex; + return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) - goto out_free; + return 0; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } } else { - ret = err; - goto out_free_ex; + return err; + } + + if (!parm->nkeys) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); + ret = -EINVAL; + goto out_release; + } + ksize = parm->nkeys * sizeof(struct tc_pedit_key); + if (nla_len(pattr) < sizeof(*parm) + ksize) { + NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); + ret = -EINVAL; + goto out_release; + } + + nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + if (!nparms) { + ret = -ENOMEM; + goto out_release; + } + + nparms->tcfp_keys_ex = + tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); + if (IS_ERR(nparms->tcfp_keys_ex)) { + ret = PTR_ERR(nparms->tcfp_keys_ex); + goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; - goto out_release; + goto out_free_ex; } nparms->tcfp_off_max_hint = 0; @@ -278,12 +282,12 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); -out_release: - tcf_idr_release(*a, bind); out_free_ex: kfree(nparms->tcfp_keys_ex); out_free: kfree(nparms); +out_release: + tcf_idr_release(*a, bind); return ret; } From e88d78a773cb5242e933930c8855bf4b2e8c2397 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 24 Feb 2023 12:00:57 -0300 Subject: [PATCH 44/53] net/sched: act_mpls: fix action bind logic The TC architecture allows filters and actions to be created independently. In filters the user can reference action objects using: tc action add action mpls ... index 1 tc filter add ... action mpls index 1 In the current code for act_mpls this is broken as it checks netlink attributes for create/update before actually checking if we are binding to an existing action. tdc results: 1..53 ok 1 a933 - Add MPLS dec_ttl action with pipe opcode ok 2 08d1 - Add mpls dec_ttl action with pass opcode ok 3 d786 - Add mpls dec_ttl action with drop opcode ok 4 f334 - Add mpls dec_ttl action with reclassify opcode ok 5 29bd - Add mpls dec_ttl action with continue opcode ok 6 48df - Add mpls dec_ttl action with jump opcode ok 7 62eb - Add mpls dec_ttl action with trap opcode ok 8 09d2 - Add mpls dec_ttl action with opcode and cookie ok 9 c170 - Add mpls dec_ttl action with opcode and cookie of max length ok 10 9118 - Add mpls dec_ttl action with invalid opcode ok 11 6ce1 - Add mpls dec_ttl action with label (invalid) ok 12 352f - Add mpls dec_ttl action with tc (invalid) ok 13 fa1c - Add mpls dec_ttl action with ttl (invalid) ok 14 6b79 - Add mpls dec_ttl action with bos (invalid) ok 15 d4c4 - Add mpls pop action with ip proto ok 16 91fb - Add mpls pop action with ip proto and cookie ok 17 92fe - Add mpls pop action with mpls proto ok 18 7e23 - Add mpls pop action with no protocol (invalid) ok 19 6182 - Add mpls pop action with label (invalid) ok 20 6475 - Add mpls pop action with tc (invalid) ok 21 067b - Add mpls pop action with ttl (invalid) ok 22 7316 - Add mpls pop action with bos (invalid) ok 23 38cc - Add mpls push action with label ok 24 c281 - Add mpls push action with mpls_mc protocol ok 25 5db4 - Add mpls push action with label, tc and ttl ok 26 7c34 - Add mpls push action with label, tc ttl and cookie of max length ok 27 16eb - Add mpls push action with label and bos ok 28 d69d - Add mpls push action with no label (invalid) ok 29 e8e4 - Add mpls push action with ipv4 protocol (invalid) ok 30 ecd0 - Add mpls push action with out of range label (invalid) ok 31 d303 - Add mpls push action with out of range tc (invalid) ok 32 fd6e - Add mpls push action with ttl of 0 (invalid) ok 33 19e9 - Add mpls mod action with mpls label ok 34 1fde - Add mpls mod action with max mpls label ok 35 0c50 - Add mpls mod action with mpls label exceeding max (invalid) ok 36 10b6 - Add mpls mod action with mpls label of MPLS_LABEL_IMPLNULL (invalid) ok 37 57c9 - Add mpls mod action with mpls min tc ok 38 6872 - Add mpls mod action with mpls max tc ok 39 a70a - Add mpls mod action with mpls tc exceeding max (invalid) ok 40 6ed5 - Add mpls mod action with mpls ttl ok 41 77c1 - Add mpls mod action with mpls ttl and cookie ok 42 b80f - Add mpls mod action with mpls max ttl ok 43 8864 - Add mpls mod action with mpls min ttl ok 44 6c06 - Add mpls mod action with mpls ttl of 0 (invalid) ok 45 b5d8 - Add mpls mod action with mpls ttl exceeding max (invalid) ok 46 451f - Add mpls mod action with mpls max bos ok 47 a1ed - Add mpls mod action with mpls min bos ok 48 3dcf - Add mpls mod action with mpls bos exceeding max (invalid) ok 49 db7c - Add mpls mod action with protocol (invalid) ok 50 b070 - Replace existing mpls push action with new ID ok 51 95a9 - Replace existing mpls push action with new label, tc, ttl and cookie ok 52 6cce - Delete mpls pop action ok 53 d138 - Flush mpls actions Fixes: 2a2ea50870ba ("net: sched: add mpls manipulation actions to TC") Reviewed-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/act_mpls.c | 66 +++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 6b26bdb999d7..809f7928a1be 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -190,40 +190,67 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_MPLS_PARMS]); index = parm->index; + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + exists = err; + if (exists && bind) + return 0; + + if (!exists) { + ret = tcf_idr_create(tn, index, est, a, &act_mpls_ops, bind, + true, flags); + if (ret) { + tcf_idr_cleanup(tn, index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + /* Verify parameters against action type. */ switch (parm->m_action) { case TCA_MPLS_ACT_POP: if (!tb[TCA_MPLS_PROTO]) { NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop"); - return -EINVAL; + err = -EINVAL; + goto release_idr; } if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) { NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop"); - return -EINVAL; + err = -EINVAL; + goto release_idr; } if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop"); - return -EINVAL; + err = -EINVAL; + goto release_idr; } break; case TCA_MPLS_ACT_DEC_TTL: if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl"); - return -EINVAL; + err = -EINVAL; + goto release_idr; } break; case TCA_MPLS_ACT_PUSH: case TCA_MPLS_ACT_MAC_PUSH: if (!tb[TCA_MPLS_LABEL]) { NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push"); - return -EINVAL; + err = -EINVAL; + goto release_idr; } if (tb[TCA_MPLS_PROTO] && !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) { NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push"); - return -EPROTONOSUPPORT; + err = -EPROTONOSUPPORT; + goto release_idr; } /* Push needs a TTL - if not specified, set a default value. */ if (!tb[TCA_MPLS_TTL]) { @@ -238,33 +265,14 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, case TCA_MPLS_ACT_MODIFY: if (tb[TCA_MPLS_PROTO]) { NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify"); - return -EINVAL; + err = -EINVAL; + goto release_idr; } break; default: NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action"); - return -EINVAL; - } - - err = tcf_idr_check_alloc(tn, &index, a, bind); - if (err < 0) - return err; - exists = err; - if (exists && bind) - return 0; - - if (!exists) { - ret = tcf_idr_create(tn, index, est, a, - &act_mpls_ops, bind, true, flags); - if (ret) { - tcf_idr_cleanup(tn, index); - return ret; - } - - ret = ACT_P_CREATED; - } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { - tcf_idr_release(*a, bind); - return -EEXIST; + err = -EINVAL; + goto release_idr; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); From 4a20056a49a1854966562241922f68197f950539 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 24 Feb 2023 12:00:58 -0300 Subject: [PATCH 45/53] net/sched: act_sample: fix action bind logic The TC architecture allows filters and actions to be created independently. In filters the user can reference action objects using: tc action add action sample ... index 1 tc filter add ... action pedit index 1 In the current code for act_sample this is broken as it checks netlink attributes for create/update before actually checking if we are binding to an existing action. tdc results: 1..29 ok 1 9784 - Add valid sample action with mandatory arguments ok 2 5c91 - Add valid sample action with mandatory arguments and continue control action ok 3 334b - Add valid sample action with mandatory arguments and drop control action ok 4 da69 - Add valid sample action with mandatory arguments and reclassify control action ok 5 13ce - Add valid sample action with mandatory arguments and pipe control action ok 6 1886 - Add valid sample action with mandatory arguments and jump control action ok 7 7571 - Add sample action with invalid rate ok 8 b6d4 - Add sample action with mandatory arguments and invalid control action ok 9 a874 - Add invalid sample action without mandatory arguments ok 10 ac01 - Add invalid sample action without mandatory argument rate ok 11 4203 - Add invalid sample action without mandatory argument group ok 12 14a7 - Add invalid sample action without mandatory argument group ok 13 8f2e - Add valid sample action with trunc argument ok 14 45f8 - Add sample action with maximum rate argument ok 15 ad0c - Add sample action with maximum trunc argument ok 16 83a9 - Add sample action with maximum group argument ok 17 ed27 - Add sample action with invalid rate argument ok 18 2eae - Add sample action with invalid group argument ok 19 6ff3 - Add sample action with invalid trunc size ok 20 2b2a - Add sample action with invalid index ok 21 dee2 - Add sample action with maximum allowed index ok 22 560e - Add sample action with cookie ok 23 704a - Replace existing sample action with new rate argument ok 24 60eb - Replace existing sample action with new group argument ok 25 2cce - Replace existing sample action with new trunc argument ok 26 59d1 - Replace existing sample action with new control argument ok 27 0a6e - Replace sample action with invalid goto chain control ok 28 3872 - Delete sample action with valid index ok 29 a394 - Delete sample action with invalid index Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action") Reviewed-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/act_sample.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index f7416b5598e0..4c670e7568dc 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -55,8 +55,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, sample_policy, NULL); if (ret < 0) return ret; - if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || - !tb[TCA_SAMPLE_PSAMPLE_GROUP]) + + if (!tb[TCA_SAMPLE_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_SAMPLE_PARMS]); @@ -80,6 +80,13 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + + if (!tb[TCA_SAMPLE_RATE] || !tb[TCA_SAMPLE_PSAMPLE_GROUP]) { + NL_SET_ERR_MSG(extack, "sample rate and group are required"); + err = -EINVAL; + goto release_idr; + } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; From 0322ef49c1ac6f0e2ef37b146c0bf8440873072c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 24 Feb 2023 17:52:33 +0200 Subject: [PATCH 46/53] net: dsa: seville: ignore mscc-miim read errors from Lynx PCS During the refactoring in the commit below, vsc9953_mdio_read() was replaced with mscc_miim_read(), which has one extra step: it checks for the MSCC_MIIM_DATA_ERROR bits before returning the result. On T1040RDB, there are 8 QSGMII PCSes belonging to the switch, and they are organized in 2 groups. First group responds to MDIO addresses 4-7 because QSGMIIACR1[MDEV_PORT] is 1, and the second group responds to MDIO addresses 8-11 because QSGMIIBCR1[MDEV_PORT] is 2. I have double checked that these values are correctly set in the SERDES, as well as PCCR1[QSGMA_CFG] and PCCR1[QSGMB_CFG] are both 0b01. mscc_miim_read: phyad 8 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 8 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 8 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 8 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 9 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 9 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 9 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 9 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 10 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 10 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 10 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 10 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 11 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 11 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 11 reg 0x1 MIIM_DATA 0x2d mscc_miim_read: phyad 11 reg 0x5 MIIM_DATA 0x5801 mscc_miim_read: phyad 4 reg 0x1 MIIM_DATA 0x3002d, ERROR mscc_miim_read: phyad 4 reg 0x5 MIIM_DATA 0x3da01, ERROR mscc_miim_read: phyad 5 reg 0x1 MIIM_DATA 0x3002d, ERROR mscc_miim_read: phyad 5 reg 0x5 MIIM_DATA 0x35801, ERROR mscc_miim_read: phyad 5 reg 0x1 MIIM_DATA 0x3002d, ERROR mscc_miim_read: phyad 5 reg 0x5 MIIM_DATA 0x35801, ERROR mscc_miim_read: phyad 6 reg 0x1 MIIM_DATA 0x3002d, ERROR mscc_miim_read: phyad 6 reg 0x5 MIIM_DATA 0x35801, ERROR mscc_miim_read: phyad 6 reg 0x1 MIIM_DATA 0x3002d, ERROR mscc_miim_read: phyad 6 reg 0x5 MIIM_DATA 0x35801, ERROR mscc_miim_read: phyad 7 reg 0x1 MIIM_DATA 0x3002d, ERROR mscc_miim_read: phyad 7 reg 0x5 MIIM_DATA 0x35801, ERROR mscc_miim_read: phyad 7 reg 0x1 MIIM_DATA 0x3002d, ERROR mscc_miim_read: phyad 7 reg 0x5 MIIM_DATA 0x35801, ERROR As can be seen, the data in MIIM_DATA is still valid despite having the MSCC_MIIM_DATA_ERROR bits set. The driver as introduced in commit 84705fc16552 ("net: dsa: felix: introduce support for Seville VSC9953 switch") was ignoring these bits, perhaps deliberately (although unbeknownst to me). This is an old IP and the hardware team cannot seem to be able to help me track down a plausible reason for these failures. I'll keep investigating, but in the meantime, this is a direct regression which must be restored to a working state. The only thing I can do is keep ignoring the errors as before. Fixes: b99658452355 ("net: dsa: ocelot: felix: utilize shared mscc-miim driver for indirect MDIO access") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/seville_vsc9953.c | 4 ++-- drivers/net/mdio/mdio-mscc-miim.c | 9 ++++++--- include/linux/mdio/mdio-mscc-miim.h | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index 287b64b788db..563ad338da25 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -893,8 +893,8 @@ static int vsc9953_mdio_bus_alloc(struct ocelot *ocelot) rc = mscc_miim_setup(dev, &bus, "VSC9953 internal MDIO bus", ocelot->targets[GCB], - ocelot->map[GCB][GCB_MIIM_MII_STATUS & REG_MASK]); - + ocelot->map[GCB][GCB_MIIM_MII_STATUS & REG_MASK], + true); if (rc) { dev_err(dev, "failed to setup MDIO bus\n"); return rc; diff --git a/drivers/net/mdio/mdio-mscc-miim.c b/drivers/net/mdio/mdio-mscc-miim.c index c87e991d1a17..1a1b95ae95fa 100644 --- a/drivers/net/mdio/mdio-mscc-miim.c +++ b/drivers/net/mdio/mdio-mscc-miim.c @@ -52,6 +52,7 @@ struct mscc_miim_info { struct mscc_miim_dev { struct regmap *regs; int mii_status_offset; + bool ignore_read_errors; struct regmap *phy_regs; const struct mscc_miim_info *info; struct clk *clk; @@ -135,7 +136,7 @@ static int mscc_miim_read(struct mii_bus *bus, int mii_id, int regnum) goto out; } - if (val & MSCC_MIIM_DATA_ERROR) { + if (!miim->ignore_read_errors && !!(val & MSCC_MIIM_DATA_ERROR)) { ret = -EIO; goto out; } @@ -212,7 +213,8 @@ static const struct regmap_config mscc_miim_phy_regmap_config = { }; int mscc_miim_setup(struct device *dev, struct mii_bus **pbus, const char *name, - struct regmap *mii_regmap, int status_offset) + struct regmap *mii_regmap, int status_offset, + bool ignore_read_errors) { struct mscc_miim_dev *miim; struct mii_bus *bus; @@ -234,6 +236,7 @@ int mscc_miim_setup(struct device *dev, struct mii_bus **pbus, const char *name, miim->regs = mii_regmap; miim->mii_status_offset = status_offset; + miim->ignore_read_errors = ignore_read_errors; *pbus = bus; @@ -285,7 +288,7 @@ static int mscc_miim_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(phy_regmap), "Unable to create phy register regmap\n"); - ret = mscc_miim_setup(dev, &bus, "mscc_miim", mii_regmap, 0); + ret = mscc_miim_setup(dev, &bus, "mscc_miim", mii_regmap, 0, false); if (ret < 0) { dev_err(dev, "Unable to setup the MDIO bus\n"); return ret; diff --git a/include/linux/mdio/mdio-mscc-miim.h b/include/linux/mdio/mdio-mscc-miim.h index 5b4ed2c3cbb9..1ce699740af6 100644 --- a/include/linux/mdio/mdio-mscc-miim.h +++ b/include/linux/mdio/mdio-mscc-miim.h @@ -14,6 +14,6 @@ int mscc_miim_setup(struct device *device, struct mii_bus **bus, const char *name, struct regmap *mii_regmap, - int status_offset); + int status_offset, bool ignore_read_errors); #endif From 940af261321307cd1dd0fe8f9c34a6129f9d4bdc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 24 Feb 2023 17:52:34 +0200 Subject: [PATCH 47/53] net: dsa: felix: fix internal MDIO controller resource length The blamed commit did not properly convert the resource start/end format into the DEFINE_RES_MEM_NAMED() start/length format, resulting in a resource for vsc9959_imdio_res which is much longer than expected: $ cat /proc/iomem 1f8000000-1f815ffff : pcie@1f0000000 1f8140000-1f815ffff : 0000:00:00.5 1f8148030-1f815006f : imdio vs (correct) $ cat /proc/iomem 1f8000000-1f815ffff : pcie@1f0000000 1f8140000-1f815ffff : 0000:00:00.5 1f8148030-1f814803f : imdio Luckily it's not big enough to exceed the size of the parent resource (pci_resource_end(pdev, VSC9959_IMDIO_PCI_BAR)), and it doesn't overlap with anything else that the Linux driver uses currently, so the larger than expected size isn't a practical problem that I can see. Although it is clearly wrong in the /proc/iomem output. Fixes: 044d447a801f ("net: dsa: felix: use DEFINE_RES_MEM_NAMED for resources") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix_vsc9959.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 354aa3dbfde7..dddb28984bdf 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -554,7 +554,7 @@ static const char * const vsc9959_resource_names[TARGET_MAX] = { * SGMII/QSGMII MAC PCS can be found. */ static const struct resource vsc9959_imdio_res = - DEFINE_RES_MEM_NAMED(0x8030, 0x8040, "imdio"); + DEFINE_RES_MEM_NAMED(0x8030, 0x10, "imdio"); static const struct reg_field vsc9959_regfields[REGFIELD_MAX] = { [ANA_ADVLEARN_VLAN_CHK] = REG_FIELD(ANA_ADVLEARN, 6, 6), From ef1a99c65edb504c509a5c4aa865830867df6e7b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 24 Feb 2023 17:52:35 +0200 Subject: [PATCH 48/53] net: mscc: ocelot: fix duplicate driver name error When compiling a kernel which has both CONFIG_NET_DSA_MSCC_OCELOT_EXT and CONFIG_MSCC_OCELOT_SWITCH enabled, the following error message will be printed: [ 5.266588] Error: Driver 'ocelot-switch' is already registered, aborting... Rename the ocelot_ext.c driver to "ocelot-ext-switch" to avoid the name duplication, and update the mfd_cell entry for its resources. Fixes: 3d7316ac81ac ("net: dsa: ocelot: add external ocelot switch control") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/mfd/ocelot-core.c | 2 +- drivers/net/dsa/ocelot/ocelot_ext.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/ocelot-core.c b/drivers/mfd/ocelot-core.c index b0ff05c1759f..e1772ff00cad 100644 --- a/drivers/mfd/ocelot-core.c +++ b/drivers/mfd/ocelot-core.c @@ -177,7 +177,7 @@ static const struct mfd_cell vsc7512_devs[] = { .num_resources = ARRAY_SIZE(vsc7512_miim1_resources), .resources = vsc7512_miim1_resources, }, { - .name = "ocelot-switch", + .name = "ocelot-ext-switch", .of_compatible = "mscc,vsc7512-switch", .num_resources = ARRAY_SIZE(vsc7512_switch_resources), .resources = vsc7512_switch_resources, diff --git a/drivers/net/dsa/ocelot/ocelot_ext.c b/drivers/net/dsa/ocelot/ocelot_ext.c index 14efa6387bd7..52b41db63a28 100644 --- a/drivers/net/dsa/ocelot/ocelot_ext.c +++ b/drivers/net/dsa/ocelot/ocelot_ext.c @@ -149,7 +149,7 @@ MODULE_DEVICE_TABLE(of, ocelot_ext_switch_of_match); static struct platform_driver ocelot_ext_switch_driver = { .driver = { - .name = "ocelot-switch", + .name = "ocelot-ext-switch", .of_match_table = of_match_ptr(ocelot_ext_switch_of_match), }, .probe = ocelot_ext_probe, From 724337be7f218111cd798702e2e17bfc6615744b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 24 Feb 2023 16:44:20 +0000 Subject: [PATCH 49/53] net: dsa: ocelot_ext: remove unnecessary phylink.h include During review of ocelot_ext, it created a private phylink instance that wasn't necessary. This was removed for subsequent postings, but the include file seems to have been left behind. Remove it. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/ocelot_ext.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/ocelot_ext.c b/drivers/net/dsa/ocelot/ocelot_ext.c index 52b41db63a28..063150659816 100644 --- a/drivers/net/dsa/ocelot/ocelot_ext.c +++ b/drivers/net/dsa/ocelot/ocelot_ext.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include From 923b2e30dc9cd05931da0f64e2e23d040865c035 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 24 Feb 2023 14:56:01 -0300 Subject: [PATCH 50/53] net/sched: act_api: move TCA_EXT_WARN_MSG to the correct hierarchy TCA_EXT_WARN_MSG is currently sitting outside of the expected hierarchy for the tc actions code. It should sit within TCA_ACT_TAB. Fixes: 0349b8779cc9 ("sched: add new attr TCA_EXT_WARN_MSG to report tc extact message") Reviewed-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/act_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fce522886099..34c508675041 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1596,12 +1596,12 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; - nla_nest_end(skb, nest); - if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; + nla_nest_end(skb, nest); + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; From ccf8f7d71424ce37394c8333482853dba4f33978 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 26 Feb 2023 11:34:29 -0500 Subject: [PATCH 51/53] xen-netback: remove unused variables pending_idx and index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit building with gcc and W=1 reports drivers/net/xen-netback/netback.c:886:21: error: variable ‘pending_idx’ set but not used [-Werror=unused-but-set-variable] 886 | u16 pending_idx; | ^~~~~~~~~~~ pending_idx is not used so remove it. Since index was only used to set pending_idx, remove index as well. Signed-off-by: Tom Rix Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20230226163429.2351600-1-trix@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/xen-netback/netback.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index bf627af723bf..1b42676ca141 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -883,11 +883,9 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX]; struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1]; unsigned int extra_count; - u16 pending_idx; RING_IDX idx; int work_to_do; unsigned int data_len; - pending_ring_idx_t index; if (queue->tx.sring->req_prod - queue->tx.req_cons > XEN_NETIF_TX_RING_SIZE) { @@ -983,9 +981,6 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, break; } - index = pending_index(queue->pending_cons); - pending_idx = queue->pending_ring[index]; - if (ret >= XEN_NETBK_LEGACY_SLOTS_MAX - 1 && data_len < txreq.size) data_len = txreq.size; From 972074ea88402e9b1b29c3a8d3d6d1aadaff624e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sat, 25 Feb 2023 08:16:44 +0100 Subject: [PATCH 52/53] net: phy: c45: fix network interface initialization failures on xtensa, arm:cubieboard Without proper initialization, "changed" returned random numbers and caused interface initialization failures. Fixes: 022c3f87f88e ("net: phy: add genphy_c45_ethtool_get/set_eee() support") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Oleksij Rempel Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230225071644.2754893-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-c45.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 3813b86689d0..fee514b96ab1 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -672,7 +672,7 @@ EXPORT_SYMBOL_GPL(genphy_c45_read_mdix); */ int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv) { - int val, changed; + int val, changed = 0; if (linkmode_intersects(phydev->supported_eee, PHY_EEE_CAP1_FEATURES)) { val = linkmode_to_mii_eee_cap1_t(adv); From 580f98cc33a260bb8c6a39ae2921b29586b84fdf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Feb 2023 08:33:36 +0000 Subject: [PATCH 53/53] tcp: tcp_check_req() can be called from process context This is a follow up of commit 0a375c822497 ("tcp: tcp_rtx_synack() can be called from process context"). Frederick Lawler reported another "__this_cpu_add() in preemptible" warning caused by the same reason. In my former patch I took care of tcp_rtx_synack() but forgot that tcp_check_req() also contained some SNMP updates. Note that some parts of tcp_check_req() always run in BH context, I added a comment to clarify this. Fixes: 8336886f786f ("tcp: TCP Fast Open Server - support TFO listeners") Link: https://lore.kernel.org/netdev/8cd33923-a21d-397c-e46b-2a068c287b03@cloudflare.com/T/ Signed-off-by: Eric Dumazet Reported-by: Frederick Lawler Tested-by: Frederick Lawler Link: https://lore.kernel.org/r/20230227083336.4153089-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_minisocks.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e002f2e1d4f2..9a7ef7732c24 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -597,6 +597,9 @@ EXPORT_SYMBOL(tcp_create_openreq_child); * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results + * + * Note: If @fastopen is true, this can be called from process context. + * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, @@ -748,7 +751,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) - __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } @@ -767,7 +770,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; }