From 966c37f2d77eb44d47af8e919267b1ba675b2eca Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 26 Oct 2018 11:30:35 +0800 Subject: [PATCH 01/49] ipv4/igmp: fix v1/v2 switchback timeout based on rfc3376, 8.12 Similar to ipv6 mcast commit 89225d1ce6af3 ("net: ipv6: mld: fix v1/v2 switchback timeout to rfc3810, 9.12.") i) RFC3376 8.12. Older Version Querier Present Timeout says: The Older Version Querier Interval is the time-out for transitioning a host back to IGMPv3 mode once an older version query is heard. When an older version query is received, hosts set their Older Version Querier Present Timer to Older Version Querier Interval. This value MUST be ((the Robustness Variable) times (the Query Interval in the last Query received)) plus (one Query Response Interval). Currently we only use a hardcoded value IGMP_V1/v2_ROUTER_PRESENT_TIMEOUT. Fix it by adding two new items mr_qi(Query Interval) and mr_qri(Query Response Interval) in struct in_device. Now we can calculate the switchback time via (mr_qrv * mr_qi) + mr_qri. We need to update these values when receiving IGMPv3 queries. Reported-by: Ying Xu Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller --- include/linux/inetdevice.h | 4 ++- net/ipv4/igmp.c | 55 +++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index c759d1cbcedd..a64f21a97369 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -37,7 +37,9 @@ struct in_device { unsigned long mr_v1_seen; unsigned long mr_v2_seen; unsigned long mr_maxdelay; - unsigned char mr_qrv; + unsigned long mr_qi; /* Query Interval */ + unsigned long mr_qri; /* Query Response Interval */ + unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; unsigned char mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4da39446da2d..765b2b32c4a4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -111,13 +111,10 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) -#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) #define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) #define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) -#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 - #define IGMP_INITIAL_REPORT_DELAY (1) @@ -935,13 +932,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -981,8 +980,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!max_delay) 
max_delay = 1; /* can't mod w/ 0 */ in_dev->mr_maxdelay = max_delay; - if (ih3->qrv) - in_dev->mr_qrv = ih3->qrv; + + /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently + * received value was zero, use the default or statically + * configured value. + */ + in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; + + /* RFC3376, 8.3. Query Response Interval: + * The number of seconds represented by the [Query Response + * Interval] must be less than the [Query Interval]. + */ + if (in_dev->mr_qri >= in_dev->mr_qi) + in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; + if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ @@ -1723,18 +1735,30 @@ void ip_mc_down(struct in_device *in_dev) ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } +#ifdef CONFIG_IP_MULTICAST +static void ip_mc_reset(struct in_device *in_dev) +{ + struct net *net = dev_net(in_dev->dev); + + in_dev->mr_qi = IGMP_QUERY_INTERVAL; + in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; +} +#else +static void ip_mc_reset(struct in_device *in_dev) +{ +} +#endif + void ip_mc_init_dev(struct in_device *in_dev) { -#ifdef CONFIG_IP_MULTICAST - struct net *net = dev_net(in_dev->dev); -#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif + ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } @@ -1744,15 +1768,10 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; -#ifdef CONFIG_IP_MULTICAST - struct net *net = dev_net(in_dev->dev); -#endif ASSERT_RTNL(); -#ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; -#endif + ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { From 
414dd6fb9a1a1b59983aea7bf0f79f0085ecc5b8 Mon Sep 17 00:00:00 2001 From: Tobias Jungel Date: Sun, 28 Oct 2018 12:54:10 +0100 Subject: [PATCH 02/49] bonding: fix length of actor system The attribute IFLA_BOND_AD_ACTOR_SYSTEM is sent to user space having the length of sizeof(bond->params.ad_actor_system) which is 8 byte. This patch aligns the length to ETH_ALEN to have the same MAC address exposed as using sysfs. Fixes: f87fda00b6ed2 ("bonding: prevent out of bound accesses") Signed-off-by: Tobias Jungel Signed-off-by: David S. Miller --- drivers/net/bonding/bond_netlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 9697977b80f0..6b9ad8673218 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -638,8 +638,7 @@ static int bond_fill_info(struct sk_buff *skb, goto nla_put_failure; if (nla_put(skb, IFLA_BOND_AD_ACTOR_SYSTEM, - sizeof(bond->params.ad_actor_system), - &bond->params.ad_actor_system)) + ETH_ALEN, &bond->params.ad_actor_system)) goto nla_put_failure; } if (!bond_3ad_get_active_agg_info(bond, &info)) { From e2d00e62f24b907d49b83802ae91a6fa6254a48c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 29 Oct 2018 09:30:29 +0900 Subject: [PATCH 03/49] Documentation: ip-sysctl.txt: Document tcp_fwmark_accept This patch documents the tcp_fwmark_accept sysctl that was added in 3.15. Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 163b5ff1073c..32b21571adfe 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -316,6 +316,17 @@ tcp_frto - INTEGER By default it's enabled with a non-zero value. 0 disables F-RTO. 
+tcp_fwmark_accept - BOOLEAN + If set, incoming connections to listening sockets that do not have a + socket mark will set the mark of the accepting socket to the fwmark of + the incoming SYN packet. This will cause all packets on that connection + (starting from the first SYNACK) to be sent with that fwmark. The + listening socket's mark is unchanged. Listening sockets that already + have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are + unaffected. + + Default: 0 + tcp_invalid_ratelimit - INTEGER Limit the maximal rate for sending duplicate acknowledgments in response to incoming TCP packets that are for an existing From 6e29464b8a72e74ec7c3f816f53bfe46a43601bc Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 29 Oct 2018 03:51:58 -0700 Subject: [PATCH 04/49] hinic: Fix l4_type parameter in hinic_task_set_tunnel_l4 Clang warns: drivers/net/ethernet/huawei/hinic/hinic_tx.c:392:34: error: implicit conversion from enumeration type 'enum hinic_l4_tunnel_type' to different enumeration type 'enum hinic_l4_offload_type' [-Werror,-Wenum-conversion] hinic_task_set_tunnel_l4(task, TUNNEL_UDP_NO_CSUM, ~~~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~~~~~~~ 1 error generated. It seems that hinic_task_set_tunnel_l4 was meant to take an enum of type hinic_l4_tunnel_type, not hinic_l4_offload_type, given both the name of the functions and the values used. Fixes: cc18a7543d2f ("net-next/hinic: add checksum offload and TSO support") Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: David S. 
Miller --- drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c | 2 +- drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c index 967c993d5303..bbf9bdd0ee3e 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c @@ -532,7 +532,7 @@ void hinic_task_set_inner_l3(struct hinic_sq_task *task, } void hinic_task_set_tunnel_l4(struct hinic_sq_task *task, - enum hinic_l4_offload_type l4_type, + enum hinic_l4_tunnel_type l4_type, u32 tunnel_len) { task->pkt_info2 |= HINIC_SQ_TASK_INFO2_SET(l4_type, TUNNEL_L4TYPE) | diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h index a0dc63a4bfc7..038522e202b6 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h @@ -160,7 +160,7 @@ void hinic_task_set_inner_l3(struct hinic_sq_task *task, u32 network_len); void hinic_task_set_tunnel_l4(struct hinic_sq_task *task, - enum hinic_l4_offload_type l4_type, + enum hinic_l4_tunnel_type l4_type, u32 tunnel_len); void hinic_set_cs_inner_l4(struct hinic_sq_task *task, From ad0b9d94182be8356978d220c82f9837cffeb7a9 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 29 Oct 2018 14:26:14 +0000 Subject: [PATCH 05/49] mlxsw: spectrum_switchdev: Don't ignore deletions of learned MACs Demands to remove FDB entries should be honored even if the FDB entry in question was originally learned, and not added by the user. Therefore ignore the added_by_user datum for SWITCHDEV_FDB_DEL_TO_DEVICE. Fixes: 816a3bed9549 ("switchdev: Add fdb.added_by_user to switchdev notifications") Signed-off-by: Petr Machata Suggested-by: Ido Schimmel Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index bc60d7a8b49d..739a51f0a366 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2661,8 +2661,6 @@ static void mlxsw_sp_switchdev_bridge_fdb_event_work(struct work_struct *work) break; case SWITCHDEV_FDB_DEL_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - if (!fdb_info->added_by_user) - break; mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, false); break; case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */ From a22712a962912faf257e857ab6857f56a93cfb34 Mon Sep 17 00:00:00 2001 From: Shalom Toledo Date: Mon, 29 Oct 2018 14:26:16 +0000 Subject: [PATCH 06/49] mlxsw: core: Fix devlink unregister flow After a failed reload, the driver is still registered to devlink, its devlink instance is still allocated and the 'reload_fail' flag is set. Then, in the next reload try, the driver's allocated devlink instance will be freed without unregistering from devlink and its components (e.g, resources). This scenario can cause a use-after-free if the user tries to execute command via devlink user-space tool. Fix by not freeing the devlink instance during reload (failed or not). Fixes: 24cc68ad6c46 ("mlxsw: core: Add support for reload") Signed-off-by: Shalom Toledo Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 24 +++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 937d0ace699a..30f751e69698 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -943,8 +943,8 @@ static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink, mlxsw_core->bus, mlxsw_core->bus_priv, true, devlink); - if (err) - mlxsw_core->reload_fail = true; + mlxsw_core->reload_fail = !!err; + return err; } @@ -1083,8 +1083,15 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, { struct devlink *devlink = priv_to_devlink(mlxsw_core); - if (mlxsw_core->reload_fail) - goto reload_fail; + if (mlxsw_core->reload_fail) { + if (!reload) + /* Only the parts that were not de-initialized in the + * failed reload attempt need to be de-initialized. + */ + goto reload_fail_deinit; + else + return; + } if (mlxsw_core->driver->fini) mlxsw_core->driver->fini(mlxsw_core); @@ -1098,9 +1105,12 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, if (!reload) devlink_resources_unregister(devlink, NULL); mlxsw_core->bus->fini(mlxsw_core->bus_priv); - if (reload) - return; -reload_fail: + + return; + +reload_fail_deinit: + devlink_unregister(devlink); + devlink_resources_unregister(devlink, NULL); devlink_free(devlink); } EXPORT_SYMBOL(mlxsw_core_bus_device_unregister); From df132eff463873e14e019a07f387b4d577d6d1f9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 29 Oct 2018 23:10:29 +0800 Subject: [PATCH 07/49] sctp: clear the transport of some out_chunk_list chunks in sctp_assoc_rm_peer If a transport is removed by asconf but there still are some chunks with this transport queuing on out_chunk_list, later an use-after-free issue will be caused when accessing this transport from these chunks in sctp_outq_flush(). 
This is an old bug, we fix it by clearing the transport of these chunks in out_chunk_list when removing a transport in sctp_assoc_rm_peer(). Reported-by: syzbot+56a40ceee5fb35932f4d@syzkaller.appspotmail.com Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/associola.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a827a1f562bf..6a28b96e779e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -499,8 +499,9 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer) { - struct list_head *pos; - struct sctp_transport *transport; + struct sctp_transport *transport; + struct list_head *pos; + struct sctp_chunk *ch; pr_debug("%s: association:%p addr:%pISpc\n", __func__, asoc, &peer->ipaddr.sa); @@ -564,7 +565,6 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, */ if (!list_empty(&peer->transmitted)) { struct sctp_transport *active = asoc->peer.active_path; - struct sctp_chunk *ch; /* Reset the transport of each chunk on this list */ list_for_each_entry(ch, &peer->transmitted, @@ -586,6 +586,10 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, sctp_transport_hold(active); } + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) + if (ch->transport == peer) + ch->transport = NULL; + asoc->peer.transport_count--; sctp_transport_free(peer); From 713358369382cebf92f6e98ce2005f94e7344931 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 29 Oct 2018 23:13:11 +0800 Subject: [PATCH 08/49] sctp: check policy more carefully when getting pr status When getting pr_assocstatus and pr_streamstatus by sctp_getsockopt, it doesn't correctly process the case when policy is set with SCTP_PR_SCTP_ALL | SCTP_PR_SCTP_MASK. It even causes a slab-out-of-bounds in sctp_getsockopt_pr_streamstatus(). This patch fixes it by return -EINVAL for this case. 
Fixes: 0ac1077e3a54 ("sctp: get pr_assoc and pr_stream all status with SCTP_PR_SCTP_ALL") Reported-by: syzbot+5da0d0a72a9e7d791748@syzkaller.appspotmail.com Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index fc0386e8ff23..739f3e50120d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7083,14 +7083,15 @@ static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || + ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc) goto out; - if (policy & SCTP_PR_SCTP_ALL) { + if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { @@ -7142,7 +7143,8 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || + ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); From da71577545a52be3e0e9225a946e5fd79cfab015 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 29 Oct 2018 20:36:43 +0000 Subject: [PATCH 09/49] rtnetlink: Disallow FDB configuration for non-Ethernet device When an FDB entry is configured, the address is validated to have the length of an Ethernet address, but the device for which the address is configured can be of any type. 
The above can result in the use of uninitialized memory when the address is later compared against existing addresses since 'dev->addr_len' is used and it may be greater than ETH_ALEN, as with ip6tnl devices. Fix this by making sure that FDB entries are only configured for Ethernet devices. BUG: KMSAN: uninit-value in memcmp+0x11d/0x180 lib/string.c:863 CPU: 1 PID: 4318 Comm: syz-executor998 Not tainted 4.19.0-rc3+ #49 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x14b/0x190 lib/dump_stack.c:113 kmsan_report+0x183/0x2b0 mm/kmsan/kmsan.c:956 __msan_warning+0x70/0xc0 mm/kmsan/kmsan_instr.c:645 memcmp+0x11d/0x180 lib/string.c:863 dev_uc_add_excl+0x165/0x7b0 net/core/dev_addr_lists.c:464 ndo_dflt_fdb_add net/core/rtnetlink.c:3463 [inline] rtnl_fdb_add+0x1081/0x1270 net/core/rtnetlink.c:3558 rtnetlink_rcv_msg+0xa0b/0x1530 net/core/rtnetlink.c:4715 netlink_rcv_skb+0x36e/0x5f0 net/netlink/af_netlink.c:2454 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4733 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x1638/0x1720 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x1205/0x1290 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe70/0x1290 net/socket.c:2114 __sys_sendmsg net/socket.c:2152 [inline] __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg+0x2a3/0x3d0 net/socket.c:2159 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2159 do_syscall_64+0xb8/0x100 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x440ee9 Code: e8 cc ab 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 bb 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fff6a93b518 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 
0000000000440ee9 RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000003 RBP: 0000000000000000 R08: 00000000004002c8 R09: 00000000004002c8 R10: 00000000004002c8 R11: 0000000000000213 R12: 000000000000b4b0 R13: 0000000000401ec0 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:256 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:181 kmsan_kmalloc+0x98/0x100 mm/kmsan/kmsan_hooks.c:91 kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan_hooks.c:100 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2718 [inline] __kmalloc_node_track_caller+0x9e7/0x1160 mm/slub.c:4351 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2f5/0x9e0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:996 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1189 [inline] netlink_sendmsg+0xb49/0x1290 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe70/0x1290 net/socket.c:2114 __sys_sendmsg net/socket.c:2152 [inline] __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg+0x2a3/0x3d0 net/socket.c:2159 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2159 do_syscall_64+0xb8/0x100 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 v2: * Make error message more specific (David) Fixes: 090096bf3db1 ("net: generic fdb support for drivers without ndo_fdb_") Signed-off-by: Ido Schimmel Reported-and-tested-by: syzbot+3a288d5f5530b901310e@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+d53ab4e92a1db04110ff@syzkaller.appspotmail.com Cc: Vlad Yasevich Cc: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f679c7a7d761..e01274bd5e3e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3600,6 +3600,11 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); @@ -3704,6 +3709,11 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); From 3aa8029e1ac4faea2967e36281d93f5d099ed6a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Oct 2018 00:18:12 -0700 Subject: [PATCH 10/49] net/mlx4_en: add a missing include Abdul Haleem reported a build error on ppc : drivers/net/ethernet/mellanox/mlx4/en_rx.c:582:18: warning: `struct iphdr` declared inside parameter list [enabled by default] struct iphdr *iph) ^ drivers/net/ethernet/mellanox/mlx4/en_rx.c:582:18: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] drivers/net/ethernet/mellanox/mlx4/en_rx.c: In function get_fixed_ipv4_csum: drivers/net/ethernet/mellanox/mlx4/en_rx.c:586:20: error: dereferencing pointer to incomplete type __u8 ipproto = iph->protocol; ^ Fixes: 55469bc6b577 ("drivers: net: remove inclusion when not needed") Signed-off-by: Eric Dumazet Reported-by: Abdul Haleem Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 5a6d0919533d..db00bf1c23f5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -43,6 +43,7 @@ #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include #endif From a6b3a3fa042343e29ffaf9169f5ba3c819d4f9a2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Oct 2018 15:41:00 +0000 Subject: [PATCH 11/49] net: mvpp2: Fix affinity hint allocation The mvpp2 driver has the curious behaviour of passing a stack variable to irq_set_affinity_hint(), which results in the kernel exploding the first time anyone accesses this information. News flash: userspace does, and irqbalance will happily take the machine down. Great stuff. An easy fix is to track the mask within the queue_vector structure, and to make sure it has the same lifetime as the interrupt itself. Fixes: e531f76757eb ("net: mvpp2: handle cases where more CPUs are available than s/w threads") Signed-off-by: Marc Zyngier Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2.h | 1 + .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h index 176c6b56fdcc..398328f10743 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h @@ -796,6 +796,7 @@ struct mvpp2_queue_vector { int nrxqs; u32 pending_cause_rx; struct mvpp2_port *port; + struct cpumask *mask; }; struct mvpp2_port { diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 14f9679c957c..7a37a37e3fb3 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3298,24 +3298,30 @@ static int mvpp2_irqs_init(struct mvpp2_port *port) for (i = 0; i < port->nqvecs; i++) { struct mvpp2_queue_vector *qv = port->qvecs + i; - if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE) + if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE) { + qv->mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!qv->mask) { + err = -ENOMEM; + goto err; + } + irq_set_status_flags(qv->irq, IRQ_NO_BALANCING); + } err = request_irq(qv->irq, mvpp2_isr, 0, port->dev->name, qv); if (err) goto err; if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE) { - unsigned long mask = 0; unsigned int cpu; for_each_present_cpu(cpu) { if (mvpp2_cpu_to_thread(port->priv, cpu) == qv->sw_thread_id) - mask |= BIT(cpu); + cpumask_set_cpu(cpu, qv->mask); } - irq_set_affinity_hint(qv->irq, to_cpumask(&mask)); + irq_set_affinity_hint(qv->irq, qv->mask); } } @@ -3325,6 +3331,8 @@ err: struct mvpp2_queue_vector *qv = port->qvecs + i; irq_set_affinity_hint(qv->irq, NULL); + kfree(qv->mask); + qv->mask = NULL; free_irq(qv->irq, qv); } @@ -3339,6 +3347,8 @@ static void mvpp2_irqs_deinit(struct mvpp2_port *port) struct mvpp2_queue_vector *qv = port->qvecs + i; irq_set_affinity_hint(qv->irq, 
NULL); + kfree(qv->mask); + qv->mask = NULL; irq_clear_status_flags(qv->irq, IRQ_NO_BALANCING); free_irq(qv->irq, qv); } From b31d30d9be32d41bef3e6076a965565d3a3d8005 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 29 Oct 2018 14:56:48 -0700 Subject: [PATCH 12/49] tools/bpf: add unlimited rlimit for flow_dissector_load On our test machine, bpf selftest test_flow_dissector.sh failed with the following error: # ./test_flow_dissector.sh bpffs not mounted. Mounting... libbpf: failed to create map (name: 'jmp_table'): Operation not permitted libbpf: failed to load object 'bpf_flow.o' ./flow_dissector_load: bpf_prog_load bpf_flow.o selftests: test_flow_dissector [FAILED] Let us increase the rlimit to remove the above map creation failure. Signed-off-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/flow_dissector_load.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c b/tools/testing/selftests/bpf/flow_dissector_load.c index d3273b5b3173..ae8180b11d5f 100644 --- a/tools/testing/selftests/bpf/flow_dissector_load.c +++ b/tools/testing/selftests/bpf/flow_dissector_load.c @@ -11,6 +11,8 @@ #include #include +#include "bpf_rlimit.h" + const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector"; const char *cfg_map_name = "jmp_table"; bool cfg_attach = true; From 27b31e68bc9fc25c519c7772fa23913687218d5f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 29 Oct 2018 12:31:28 -0700 Subject: [PATCH 13/49] bpf: tcp_bpf_recvmsg should return EAGAIN when nonblocking and no data We return 0 in the case of a nonblocking socket that has no data available. However, this is incorrect and may confuse applications. After this patch we do the correct thing and return the error EAGAIN. 
Quoting return codes from recvmsg manpage, EAGAIN or EWOULDBLOCK The socket is marked nonblocking and the receive operation would block, or a receive timeout had been set and the timeout expired before data was received. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/ipv4/tcp_bpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b7918d4caa30..3b45fe530f91 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -145,6 +145,7 @@ msg_bytes_ready: ret = err; goto out; } + copied = -EAGAIN; } ret = copied; out: From 094bf4d0e9657f6ea1ee3d7e07ce3970796949ce Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 12 Oct 2018 13:13:39 +0200 Subject: [PATCH 14/49] igb: shorten maximum PHC timecounter update interval The timecounter needs to be updated at least once per ~550 seconds in order to avoid a 40-bit SYSTIM timestamp to be misinterpreted as an old timestamp. Since commit 500462a9d ("timers: Switch to a non-cascading wheel"), scheduling of delayed work seems to be less accurate and a requested delay of 540 seconds may actually be longer than 550 seconds. Shorten the delay to 480 seconds to be sure the timecounter is updated in time. This fixes an issue with HW timestamps on 82580/I350/I354 being off by ~1100 seconds for few seconds every ~9 minutes. 
Cc: Jacob Keller Cc: Richard Cochran Cc: Thomas Gleixner Signed-off-by: Miroslav Lichvar Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_ptp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 9f4d700e09df..29ced6b74d36 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -51,9 +51,15 @@ * * The 40 bit 82580 SYSTIM overflows every * 2^40 * 10^-9 / 60 = 18.3 minutes. + * + * SYSTIM is converted to real time using a timecounter. As + * timecounter_cyc2time() allows old timestamps, the timecounter + * needs to be updated at least once per half of the SYSTIM interval. + * Scheduling of delayed work is not very accurate, so we aim for 8 + * minutes to be sure the actual interval is shorter than 9.16 minutes. */ -#define IGB_SYSTIM_OVERFLOW_PERIOD (HZ * 60 * 9) +#define IGB_SYSTIM_OVERFLOW_PERIOD (HZ * 60 * 8) #define IGB_PTP_TX_TIMEOUT (HZ * 15) #define INCPERIOD_82576 BIT(E1000_TIMINCA_16NS_SHIFT) #define INCVALUE_82576_MASK GENMASK(E1000_TIMINCA_16NS_SHIFT - 1, 0) From e69e40c8066c0e2fafd63a5919cb9b2fb9f81f2e Mon Sep 17 00:00:00 2001 From: Ngai-Mint Kwan Date: Mon, 15 Oct 2018 12:18:27 -0700 Subject: [PATCH 15/49] fm10k: fix SM mailbox full condition Current condition will always incorrectly report a full SM mailbox if an IES API application is not running. Due to this, the "fm10k_service_task" will be infinitely queued into the driver's workqueue. This, in turn, will cause a "kworker" thread to report 100% CPU utilization and might cause "soft lockup" events or system crashes. To fix this issue, a new condition is added to determine if the SM mailbox is in the correct state of FM10K_STATE_OPEN before proceeding. In other words, an instance of the IES API must be running. 
If there is, the remainder of the flow stays the same which is to determine if the SM mailbox capacity has been exceeded or not and take appropriate action. Signed-off-by: Ngai-Mint Kwan Signed-off-by: Jacob Keller Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index e707d717012f..74160c2095ee 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -244,7 +244,8 @@ process_mbx: } /* guarantee we have free space in the SM mailbox */ - if (!hw->mbx.ops.tx_ready(&hw->mbx, FM10K_VFMBX_MSG_MTU)) { + if (hw->mbx.state == FM10K_STATE_OPEN && + !hw->mbx.ops.tx_ready(&hw->mbx, FM10K_VFMBX_MSG_MTU)) { /* keep track of how many times this occurs */ interface->hw_sm_mbx_full++; From e330af788998b0de4da4f5bd7ddd087507999800 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 15 Oct 2018 12:18:28 -0700 Subject: [PATCH 16/49] fm10k: ensure completer aborts are marked as non-fatal after a resume VF drivers can trigger PCIe completer aborts any time they read a queue that they don't own. Even in nominal circumstances, it is not possible to prevent the VF driver from reading queues it doesn't own. VF drivers may attempt to read queues it previously owned, but which it no longer does due to a PF reset. Normally these completer aborts aren't an issue. However, on some platforms these trigger machine check errors. This is true even if we lower their severity from fatal to non-fatal. Indeed, we already have code for lowering the severity. We could attempt to mask these errors conditionally around resets, which is the most common time they would occur. However this would essentially be a race between the PF and VF drivers, and we may still occasionally see machine check exceptions on these strictly configured platforms. 
Instead, mask the errors entirely any time we resume VFs. By doing so, we prevent the completer aborts from being sent to the parent PCIe device, and thus these strict platforms will not upgrade them into machine check errors. Additionally, we don't lose any information by masking these errors, because we'll still report VFs which attempt to access queues via the FUM_BAD_VF_QACCESS errors. Without this change, on platforms where completer aborts cause machine check exceptions, the VF reading queues it doesn't own could crash the host system. Masking the completer abort prevents this, so we should mask it for good, and not just around a PCIe reset. Otherwise malicious or misconfigured VFs could cause the host system to crash. Because we are masking the error entirely, there is little reason to also keep setting the severity bit, so that code is also removed. Signed-off-by: Jacob Keller Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 48 ++++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index 74160c2095ee..5d4f1761dc0c 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -303,6 +303,28 @@ void fm10k_iov_suspend(struct pci_dev *pdev) } } +static void fm10k_mask_aer_comp_abort(struct pci_dev *pdev) +{ + u32 err_mask; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); + if (!pos) + return; + + /* Mask the completion abort bit in the ERR_UNCOR_MASK register, + * preventing the device from reporting these errors to the upstream + * PCIe root device. This avoids bringing down platforms which upgrade + * non-fatal completer aborts into machine check exceptions. Completer + * aborts can occur whenever a VF reads a queue it doesn't own. 
+ */ + pci_read_config_dword(pdev, pos + PCI_ERR_UNCOR_MASK, &err_mask); + err_mask |= PCI_ERR_UNC_COMP_ABORT; + pci_write_config_dword(pdev, pos + PCI_ERR_UNCOR_MASK, err_mask); + + mmiowb(); +} + int fm10k_iov_resume(struct pci_dev *pdev) { struct fm10k_intfc *interface = pci_get_drvdata(pdev); @@ -318,6 +340,12 @@ int fm10k_iov_resume(struct pci_dev *pdev) if (!iov_data) return -ENOMEM; + /* Lower severity of completer abort error reporting as + * the VFs can trigger this any time they read a queue + * that they don't own. + */ + fm10k_mask_aer_comp_abort(pdev); + /* allocate hardware resources for the VFs */ hw->iov.ops.assign_resources(hw, num_vfs, num_vfs); @@ -461,20 +489,6 @@ void fm10k_iov_disable(struct pci_dev *pdev) fm10k_iov_free_data(pdev); } -static void fm10k_disable_aer_comp_abort(struct pci_dev *pdev) -{ - u32 err_sev; - int pos; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); - if (!pos) - return; - - pci_read_config_dword(pdev, pos + PCI_ERR_UNCOR_SEVER, &err_sev); - err_sev &= ~PCI_ERR_UNC_COMP_ABORT; - pci_write_config_dword(pdev, pos + PCI_ERR_UNCOR_SEVER, err_sev); -} - int fm10k_iov_configure(struct pci_dev *pdev, int num_vfs) { int current_vfs = pci_num_vf(pdev); @@ -496,12 +510,6 @@ int fm10k_iov_configure(struct pci_dev *pdev, int num_vfs) /* allocate VFs if not already allocated */ if (num_vfs && num_vfs != current_vfs) { - /* Disable completer abort error reporting as - * the VFs can trigger this any time they read a queue - * that they don't own. - */ - fm10k_disable_aer_comp_abort(pdev); - err = pci_enable_sriov(pdev, num_vfs); if (err) { dev_err(&pdev->dev, From 9a1fe1e2bbc4194103bd6f5f8d78383d3bef41ae Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 15 Oct 2018 12:18:29 -0700 Subject: [PATCH 17/49] fm10k: add missing device IDs to the upstream driver The device IDs for the Ethernet SDI Adapter devices were never added to the upstream driver. 
The IDs are already in the pci.ids database, and are supported by the out-of-tree driver. Add the device IDs now, so that the upstream driver can recognize and load these devices. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 2 ++ drivers/net/ethernet/intel/fm10k/fm10k_type.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 02345d381303..e49fb51d3613 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -23,6 +23,8 @@ static const struct fm10k_info *fm10k_info_tbl[] = { */ static const struct pci_device_id fm10k_pci_tbl[] = { { PCI_VDEVICE(INTEL, FM10K_DEV_ID_PF), fm10k_device_pf }, + { PCI_VDEVICE(INTEL, FM10K_DEV_ID_SDI_FM10420_QDA2), fm10k_device_pf }, + { PCI_VDEVICE(INTEL, FM10K_DEV_ID_SDI_FM10420_DA2), fm10k_device_pf }, { PCI_VDEVICE(INTEL, FM10K_DEV_ID_VF), fm10k_device_vf }, /* required last entry */ { 0, } diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_type.h b/drivers/net/ethernet/intel/fm10k/fm10k_type.h index 3e608e493f9d..9fb9fca375e3 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_type.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k_type.h @@ -15,6 +15,8 @@ struct fm10k_hw; #define FM10K_DEV_ID_PF 0x15A4 #define FM10K_DEV_ID_VF 0x15A5 +#define FM10K_DEV_ID_SDI_FM10420_QDA2 0x15D0 +#define FM10K_DEV_ID_SDI_FM10420_DA2 0x15D5 #define FM10K_MAX_QUEUES 256 #define FM10K_MAX_QUEUES_PF 128 From 35ae5414e7085dfabe3d1737569eff549b04942e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 15 Oct 2018 12:21:28 -0700 Subject: [PATCH 18/49] fm10k: bump driver version to match out-of-tree release The upstream and out-of-tree drivers are once again at comparable functionality. It's been a while since we updated the upstream driver version, so bump it now. 
Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 503bbc017792..5b2a50e5798f 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -11,7 +11,7 @@ #include "fm10k.h" -#define DRV_VERSION "0.23.4-k" +#define DRV_VERSION "0.26.1-k" #define DRV_SUMMARY "Intel(R) Ethernet Switch Host Interface Driver" const char fm10k_driver_version[] = DRV_VERSION; char fm10k_driver_name[] = "fm10k"; From 48e01e001da31d5a40e31ed5f8cea83a18823cc1 Mon Sep 17 00:00:00 2001 From: Jeff Kirsher Date: Thu, 18 Oct 2018 15:39:43 -0700 Subject: [PATCH 19/49] ixgbe/ixgbevf: fix XFRM_ALGO dependency Based on the original work from Arnd Bergmann. When XFRM_ALGO is not enabled, the new ixgbe IPsec code produces a link error: drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.o: In function `ixgbe_ipsec_vf_add_sa': ixgbe_ipsec.c:(.text+0x1266): undefined reference to `xfrm_aead_get_byname' Simply selecting XFRM_ALGO from here causes circular dependencies, so to fix it, we probably want this slightly more complex solution that is similar to what other drivers with XFRM offload do: A separate Kconfig symbol now controls whether we include the IPsec offload code. To keep the old behavior, this is left as 'default y'. The dependency in XFRM_OFFLOAD still causes a circular dependency but is not actually needed because this symbol is not user visible, so removing that dependency on top makes it all work. 
CC: Arnd Bergmann CC: Shannon Nelson Fixes: eda0333ac293 ("ixgbe: add VF IPsec management") Signed-off-by: Jeff Kirsher Tested-by: Andrew Bowers --- drivers/net/ethernet/intel/Kconfig | 18 ++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/Makefile | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 8 ++++---- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 +++--- drivers/net/ethernet/intel/ixgbevf/Makefile | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 4 ++-- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- net/xfrm/Kconfig | 1 - 8 files changed, 30 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index fd3373d82a9e..59e1bc0f609e 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -200,6 +200,15 @@ config IXGBE_DCB If unsure, say N. +config IXGBE_IPSEC + bool "IPSec XFRM cryptography-offload acceleration" + depends on IXGBE + depends on XFRM_OFFLOAD + default y + select XFRM_ALGO + ---help--- + Enable support for IPSec offload in ixgbe.ko + config IXGBEVF tristate "Intel(R) 10GbE PCI Express Virtual Function Ethernet support" depends on PCI_MSI @@ -217,6 +226,15 @@ config IXGBEVF will be called ixgbevf. MSI-X interrupt support is required for this driver to work correctly. 
+config IXGBEVF_IPSEC + bool "IPSec XFRM cryptography-offload acceleration" + depends on IXGBEVF + depends on XFRM_OFFLOAD + default y + select XFRM_ALGO + ---help--- + Enable support for IPSec offload in ixgbevf.ko + config I40E tristate "Intel(R) Ethernet Controller XL710 Family support" imply PTP_1588_CLOCK diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile index ca6b0c458e4a..4fb0d9e3f2da 100644 --- a/drivers/net/ethernet/intel/ixgbe/Makefile +++ b/drivers/net/ethernet/intel/ixgbe/Makefile @@ -17,4 +17,4 @@ ixgbe-$(CONFIG_IXGBE_DCB) += ixgbe_dcb.o ixgbe_dcb_82598.o \ ixgbe-$(CONFIG_IXGBE_HWMON) += ixgbe_sysfs.o ixgbe-$(CONFIG_DEBUG_FS) += ixgbe_debugfs.o ixgbe-$(CONFIG_FCOE:m=y) += ixgbe_fcoe.o -ixgbe-$(CONFIG_XFRM_OFFLOAD) += ixgbe_ipsec.o +ixgbe-$(CONFIG_IXGBE_IPSEC) += ixgbe_ipsec.o diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index ec1b87cc4410..143bdd5ee2a0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -769,9 +769,9 @@ struct ixgbe_adapter { #define IXGBE_RSS_KEY_SIZE 40 /* size of RSS Hash Key in bytes */ u32 *rss_key; -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_IXGBE_IPSEC struct ixgbe_ipsec *ipsec; -#endif /* CONFIG_XFRM_OFFLOAD */ +#endif /* CONFIG_IXGBE_IPSEC */ /* AF_XDP zero-copy */ struct xdp_umem **xsk_umems; @@ -1008,7 +1008,7 @@ void ixgbe_store_key(struct ixgbe_adapter *adapter); void ixgbe_store_reta(struct ixgbe_adapter *adapter); s32 ixgbe_negotiate_fc(struct ixgbe_hw *hw, u32 adv_reg, u32 lp_reg, u32 adv_sym, u32 adv_asm, u32 lp_sym, u32 lp_asm); -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_IXGBE_IPSEC void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter); void ixgbe_stop_ipsec_offload(struct ixgbe_adapter *adapter); void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter); @@ -1036,5 +1036,5 @@ static inline int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *mbuf, u32 
vf) { return -EACCES; } static inline int ixgbe_ipsec_vf_del_sa(struct ixgbe_adapter *adapter, u32 *mbuf, u32 vf) { return -EACCES; } -#endif /* CONFIG_XFRM_OFFLOAD */ +#endif /* CONFIG_IXGBE_IPSEC */ #endif /* _IXGBE_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 0049a2becd7e..113b38e0defb 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8694,7 +8694,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, #endif /* IXGBE_FCOE */ -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_IXGBE_IPSEC if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, &ipsec_tx)) goto out_drop; #endif @@ -10190,7 +10190,7 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev, * the TSO, so it's the exception. */ if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID)) { -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_IXGBE_IPSEC if (!skb->sp) #endif features &= ~NETIF_F_TSO; @@ -10883,7 +10883,7 @@ skip_sriov: if (hw->mac.type >= ixgbe_mac_82599EB) netdev->features |= NETIF_F_SCTP_CRC; -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_IXGBE_IPSEC #define IXGBE_ESP_FEATURES (NETIF_F_HW_ESP | \ NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) diff --git a/drivers/net/ethernet/intel/ixgbevf/Makefile b/drivers/net/ethernet/intel/ixgbevf/Makefile index 297d0f0858b5..186a4bb24fde 100644 --- a/drivers/net/ethernet/intel/ixgbevf/Makefile +++ b/drivers/net/ethernet/intel/ixgbevf/Makefile @@ -10,5 +10,5 @@ ixgbevf-objs := vf.o \ mbx.o \ ethtool.o \ ixgbevf_main.o -ixgbevf-$(CONFIG_XFRM_OFFLOAD) += ipsec.o +ixgbevf-$(CONFIG_IXGBEVF_IPSEC) += ipsec.o diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index e399e1c0c54a..ecab686574b6 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -459,7 +459,7 @@ int ethtool_ioctl(struct ifreq *ifr); extern void 
ixgbevf_write_eitr(struct ixgbevf_q_vector *q_vector); -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_IXGBEVF_IPSEC void ixgbevf_init_ipsec_offload(struct ixgbevf_adapter *adapter); void ixgbevf_stop_ipsec_offload(struct ixgbevf_adapter *adapter); void ixgbevf_ipsec_restore(struct ixgbevf_adapter *adapter); @@ -482,7 +482,7 @@ static inline int ixgbevf_ipsec_tx(struct ixgbevf_ring *tx_ring, struct ixgbevf_tx_buffer *first, struct ixgbevf_ipsec_tx_data *itd) { return 0; } -#endif /* CONFIG_XFRM_OFFLOAD */ +#endif /* CONFIG_IXGBEVF_IPSEC */ void ixgbe_napi_add_all(struct ixgbevf_adapter *adapter); void ixgbe_napi_del_all(struct ixgbevf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 98707ee11d72..5e47ede7e832 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4150,7 +4150,7 @@ static int ixgbevf_xmit_frame_ring(struct sk_buff *skb, first->tx_flags = tx_flags; first->protocol = vlan_get_protocol(skb); -#ifdef CONFIG_XFRM_OFFLOAD +#ifdef CONFIG_IXGBEVF_IPSEC if (skb->sp && !ixgbevf_ipsec_tx(tx_ring, first, &ipsec_tx)) goto out_drop; #endif diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 4a9ee2d83158..140270a13d54 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -8,7 +8,6 @@ config XFRM config XFRM_OFFLOAD bool - depends on XFRM config XFRM_ALGO tristate From bb58fd7eeffc9bd5d6e2a16cbf0e9e259f8d09f2 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Fri, 19 Oct 2018 14:11:03 -0700 Subject: [PATCH 20/49] i40e: Update status codes Add a few new status code which will be used by the ice driver, and rename a few to make them more consistent. Error code are mapped to similar values as in i40e_status.h, so as to be compatible with older VF drivers not using this status enum. 
Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 +- include/linux/avf/virtchnl.h | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 81b0e1f8d14b..ac5698ed0b11 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3674,7 +3674,7 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, dev_err(&pf->pdev->dev, "Invalid message from VF %d, opcode %d, len %d\n", local_vf_id, v_opcode, msglen); switch (ret) { - case VIRTCHNL_ERR_PARAM: + case VIRTCHNL_STATUS_ERR_PARAM: return -EPERM; default: return -EINVAL; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2c9756bd9c4c..b2488055fd1d 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -62,13 +62,19 @@ /* Error Codes */ enum virtchnl_status_code { VIRTCHNL_STATUS_SUCCESS = 0, - VIRTCHNL_ERR_PARAM = -5, + VIRTCHNL_STATUS_ERR_PARAM = -5, + VIRTCHNL_STATUS_ERR_NO_MEMORY = -18, VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH = -38, VIRTCHNL_STATUS_ERR_CQP_COMPL_ERROR = -39, VIRTCHNL_STATUS_ERR_INVALID_VF_ID = -40, - VIRTCHNL_STATUS_NOT_SUPPORTED = -64, + VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR = -53, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED = -64, }; +/* Backward compatibility */ +#define VIRTCHNL_ERR_PARAM VIRTCHNL_STATUS_ERR_PARAM +#define VIRTCHNL_STATUS_NOT_SUPPORTED VIRTCHNL_STATUS_ERR_NOT_SUPPORTED + #define VIRTCHNL_LINK_SPEED_100MB_SHIFT 0x1 #define VIRTCHNL_LINK_SPEED_1000MB_SHIFT 0x2 #define VIRTCHNL_LINK_SPEED_10GB_SHIFT 0x3 @@ -831,7 +837,7 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: default: - return VIRTCHNL_ERR_PARAM; + return VIRTCHNL_STATUS_ERR_PARAM; } /* few more checks 
*/ if (err_msg_format || valid_len != msglen) From 6702185c1ffec3421181b5e24491e3fac920cb61 Mon Sep 17 00:00:00 2001 From: Radoslaw Tyl Date: Mon, 22 Oct 2018 08:44:31 +0200 Subject: [PATCH 21/49] ixgbe: fix MAC anti-spoofing filter after VFLR This change resolves a driver bug where the driver is logging a message that says "Spoofed packets detected". This can occur on the PF (host) when a VF has VLAN+MACVLAN enabled and is re-started with a different MAC address. MAC and VLAN anti-spoofing filters are to be enabled together. Signed-off-by: Radoslaw Tyl Tested-by: Andrew Bowers Acked-by: Piotr Skajewski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index af25a8fffeb8..5dacfc870259 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -722,8 +722,10 @@ static inline void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf) ixgbe_set_vmvir(adapter, vfinfo->pf_vlan, adapter->default_up, vf); - if (vfinfo->spoofchk_enabled) + if (vfinfo->spoofchk_enabled) { hw->mac.ops.set_vlan_anti_spoofing(hw, true, vf); + hw->mac.ops.set_mac_anti_spoofing(hw, true, vf); + } } /* reset multicast table array for vf */ From e7611088f0357339d8c30540222debfa24095d4b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 29 Oct 2018 22:46:11 +0000 Subject: [PATCH 22/49] net: hns3: fix spelling mistake "intrerrupt" -> "interrupt" Trivial fix to spelling mistake in dev_err message Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index dca6f2326c26..123c37e653f3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -751,7 +751,7 @@ static void hclge_process_ncsi_error(struct hclge_dev *hdev, ret = hclge_cmd_clear_error(hdev, &desc_wr, &desc_rd, HCLGE_NCSI_INT_CLR, 0); if (ret) - dev_err(dev, "failed(=%d) to clear NCSI intrerrupt status\n", + dev_err(dev, "failed(=%d) to clear NCSI interrupt status\n", ret); } From c4147beabec19fc7b37eb79251114bad3e9915dd Mon Sep 17 00:00:00 2001 From: Bo YU Date: Mon, 29 Oct 2018 23:42:09 -0400 Subject: [PATCH 23/49] net: add an identifier name for 'struct sock *' Fix a warning from checkpatch: function definition argument 'struct sock *' should also have an identifier name in include/net/af_unix.h. Signed-off-by: Bo YU Signed-off-by: David S. Miller --- include/net/af_unix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index e2695c4bf358..d53aea859a76 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -13,7 +13,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp); void unix_gc(void); void wait_for_unix_gc(void); struct sock *unix_get_socket(struct file *filp); -struct sock *unix_peer_get(struct sock *); +struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_SIZE 256 #define UNIX_HASH_BITS 8 From b1c234441e07da748ccded3aaa37177622d469d3 Mon Sep 17 00:00:00 2001 From: Bo YU Date: Mon, 29 Oct 2018 23:42:10 -0400 Subject: [PATCH 24/49] net: drop a space before tabs Fix a warning from checkpatch.pl:'please no space before tabs' in include/net/af_unix.h Signed-off-by: Bo YU Signed-off-by: David S. 
Miller --- include/net/af_unix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index d53aea859a76..ddbba838d048 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -40,7 +40,7 @@ struct unix_skb_parms { u32 consumed; } __randomize_layout; -#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) +#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) #define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) #define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) From ff002269a4ee9c769dbf9365acef633ebcbd6cbe Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Oct 2018 14:10:49 +0800 Subject: [PATCH 25/49] vhost: Fix Spectre V1 vulnerability The idx in vhost_vring_ioctl() was controlled by userspace, hence a potential exploitation of the Spectre variant 1 vulnerability. Fix this by sanitizing idx before using it to index d->vqs. Cc: Michael S. Tsirkin Cc: Josh Poimboeuf Cc: Andrea Arcangeli Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f52008bb8df7..3a5f81a66d34 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "vhost.h" @@ -1387,6 +1388,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg if (idx >= d->nvqs) return -ENOBUFS; + idx = array_index_nospec(idx, d->nvqs); vq = d->vqs[idx]; mutex_lock(&vq->mutex); From d48051c5b8376038c2b287c3b1bd55b8d391d567 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Oct 2018 00:57:25 -0700 Subject: [PATCH 26/49] net/mlx5e: fix csum adjustments caused by RXFCS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As shown by Dimitris, we need to use csum_block_add() instead of csum_add() when adding the FCS contribution to skb csum.
Before 4.18 (more exactly commit 88078d98d1bb "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"), the whole skb csum was thrown away, so RXFCS changes were ignored. Then, before commit d55bef5059dd ("net: fix pskb_trim_rcsum_slow() with odd trim offset"), the mlx5 and pskb_trim_rcsum_slow() bugs were canceling each other out. Now that we have fixed pskb_trim_rcsum_slow(), we need to fix mlx5. Note that this patch also rewrites mlx5e_get_fcs() to: - Use skb_header_pointer() instead of reinventing it. - Use __get_unaligned_cpu32() to avoid possible unaligned accesses, as Dimitris pointed out. Fixes: 902a545904c7 ("net/mlx5e: When RXFCS is set, add FCS data into checksum calculation") Reported-by: Paweł Staszewski Signed-off-by: Eric Dumazet Cc: Eran Ben Elisha Cc: Saeed Mahameed Cc: Dimitris Michailidis Cc: Cong Wang Cc: Paweł Staszewski Reviewed-by: Eran Ben Elisha Tested-By: Maria Pasechnik Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 45 ++++--------------- 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 94224c22ecc3..79638dcbae78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -713,43 +713,15 @@ static inline void mlx5e_enable_ecn(struct mlx5e_rq *rq, struct sk_buff *skb) rq->stats->ecn_mark += !!rc; } -static __be32 mlx5e_get_fcs(struct sk_buff *skb) +static u32 mlx5e_get_fcs(const struct sk_buff *skb) { - int last_frag_sz, bytes_in_prev, nr_frags; - u8 *fcs_p1, *fcs_p2; - skb_frag_t *last_frag; - __be32 fcs_bytes; + const void *fcs_bytes; + u32 _fcs_bytes; - if (!skb_is_nonlinear(skb)) - return *(__be32 *)(skb->data + skb->len - ETH_FCS_LEN); + fcs_bytes = skb_header_pointer(skb, skb->len - ETH_FCS_LEN, + ETH_FCS_LEN, &_fcs_bytes); - nr_frags = skb_shinfo(skb)->nr_frags; - last_frag = &skb_shinfo(skb)->frags[nr_frags - 1]; - last_frag_sz =
skb_frag_size(last_frag); - - /* If all FCS data is in last frag */ - if (last_frag_sz >= ETH_FCS_LEN) - return *(__be32 *)(skb_frag_address(last_frag) + - last_frag_sz - ETH_FCS_LEN); - - fcs_p2 = (u8 *)skb_frag_address(last_frag); - bytes_in_prev = ETH_FCS_LEN - last_frag_sz; - - /* Find where the other part of the FCS is - Linear or another frag */ - if (nr_frags == 1) { - fcs_p1 = skb_tail_pointer(skb); - } else { - skb_frag_t *prev_frag = &skb_shinfo(skb)->frags[nr_frags - 2]; - - fcs_p1 = skb_frag_address(prev_frag) + - skb_frag_size(prev_frag); - } - fcs_p1 -= bytes_in_prev; - - memcpy(&fcs_bytes, fcs_p1, bytes_in_prev); - memcpy(((u8 *)&fcs_bytes) + bytes_in_prev, fcs_p2, last_frag_sz); - - return fcs_bytes; + return __get_unaligned_cpu32(fcs_bytes); } static u8 get_ip_proto(struct sk_buff *skb, __be16 proto) @@ -797,8 +769,9 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, network_depth - ETH_HLEN, skb->csum); if (unlikely(netdev->features & NETIF_F_RXFCS)) - skb->csum = csum_add(skb->csum, - (__force __wsum)mlx5e_get_fcs(skb)); + skb->csum = csum_block_add(skb->csum, + (__force __wsum)mlx5e_get_fcs(skb), + skb->len - ETH_FCS_LEN); stats->csum_complete++; return; } From ece4bf46e98c9f3775a488f3932a531508d3b1a2 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:43 +0800 Subject: [PATCH 27/49] net: hns3: add error handler for hns3_nic_init_vector_data() When hns3_nic_init_vector_data() fails to map ring to vector, it should cancel the netif_napi_add() that has been successfully done and then exits. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Huazhong Tan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 32f3aca814e7..0b4323b1f930 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2821,7 +2821,7 @@ static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) struct hnae3_handle *h = priv->ae_handle; struct hns3_enet_tqp_vector *tqp_vector; int ret = 0; - u16 i; + int i; hns3_nic_set_cpumask(priv); @@ -2868,13 +2868,19 @@ static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) hns3_free_vector_ring_chain(tqp_vector, &vector_ring_chain); if (ret) - return ret; + goto map_ring_fail; netif_napi_add(priv->netdev, &tqp_vector->napi, hns3_nic_common_poll, NAPI_POLL_WEIGHT); } return 0; + +map_ring_fail: + while (i--) + netif_napi_del(&priv->tqp_vector[i].napi); + + return ret; } static int hns3_nic_alloc_vector_data(struct hns3_nic_priv *priv) From 73b907a083b8a8c1c62cb494bc9fbe6ae086c460 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:44 +0800 Subject: [PATCH 28/49] net: hns3: bugfix for buffer not free problem during resetting When hns3_get_ring_config()/hns3_queue_to_ring()/ hns3_get_vector_ring_chain() failed during resetting, the allocated memory has not been freed before these three functions return. So this patch adds error handler in these functions to fix it. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Huazhong Tan Signed-off-by: David S. 
Miller --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 0b4323b1f930..b767ff96b537 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2727,7 +2727,7 @@ static int hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, chain = devm_kzalloc(&pdev->dev, sizeof(*chain), GFP_KERNEL); if (!chain) - return -ENOMEM; + goto err_free_chain; cur_chain->next = chain; chain->tqp_index = tx_ring->tqp->tqp_index; @@ -2757,7 +2757,7 @@ static int hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, while (rx_ring) { chain = devm_kzalloc(&pdev->dev, sizeof(*chain), GFP_KERNEL); if (!chain) - return -ENOMEM; + goto err_free_chain; cur_chain->next = chain; chain->tqp_index = rx_ring->tqp->tqp_index; @@ -2772,6 +2772,16 @@ static int hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, } return 0; + +err_free_chain: + cur_chain = head->next; + while (cur_chain) { + chain = cur_chain->next; + devm_kfree(&pdev->dev, chain); + cur_chain = chain; + } + + return -ENOMEM; } static void hns3_free_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, @@ -3037,8 +3047,10 @@ static int hns3_queue_to_ring(struct hnae3_queue *tqp, return ret; ret = hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_RX); - if (ret) + if (ret) { + devm_kfree(priv->dev, priv->ring_data[tqp->tqp_index].ring); return ret; + } return 0; } @@ -3065,6 +3077,12 @@ static int hns3_get_ring_config(struct hns3_nic_priv *priv) return 0; err: + while (i--) { + devm_kfree(priv->dev, priv->ring_data[i].ring); + devm_kfree(priv->dev, + priv->ring_data[i + h->kinfo.num_tqps].ring); + } + devm_kfree(&pdev->dev, priv->ring_data); return ret; } From 0d4411408a7fb9aad0645f23911d9bfdd2ce3177 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: 
Tue, 30 Oct 2018 21:50:45 +0800 Subject: [PATCH 29/49] net: hns3: bugfix for reporting unknown vector0 interrupt repeatly problem The current driver supports handling two vector0 interrupts, reset and mailbox. When the hardware reports an interrupt from any other interrupt source, and the driver re-enables the interrupt without processing it, the hardware will repeatedly report the unknown interrupt. Therefore, the driver now enables the vector0 interrupt only after clearing a known type of interrupt source. In all other cases the interrupt is not re-enabled. Fixes: cd8c5c269b1d ("net: hns3: Fix for hclge_reset running repeatly problem") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 5234b5373ed3..2a6314784c8a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2236,7 +2236,7 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data) } /* clear the source of interrupt if it is not cause by reset */ - if (event_cause != HCLGE_VECTOR0_EVENT_RST) { + if (event_cause == HCLGE_VECTOR0_EVENT_MBX) { hclge_clear_event_cause(hdev, event_cause, clearval); hclge_enable_vector(&hdev->misc_vector, true); } From b2f74dbaf12bf59ff35d451005b3cdee78232ff0 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:46 +0800 Subject: [PATCH 30/49] net: hns3: bugfix for the initialization of command queue's spin lock The spin lock of the command queue only needs to be initialized once when the driver initializes the command queue. It is not necessary to initialize the spin lock when resetting. At the same time, the modification of the queue member should be performed after acquiring the lock.
Fixes: 3efb960f056d ("net: hns3: Refactor the initialization of command queue") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index ac13cb2b168e..68026a5ad7e7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -304,6 +304,10 @@ int hclge_cmd_queue_init(struct hclge_dev *hdev) { int ret; + /* Setup the lock for command queue */ + spin_lock_init(&hdev->hw.cmq.csq.lock); + spin_lock_init(&hdev->hw.cmq.crq.lock); + /* Setup the queue entries for use cmd queue */ hdev->hw.cmq.csq.desc_num = HCLGE_NIC_CMQ_DESC_NUM; hdev->hw.cmq.crq.desc_num = HCLGE_NIC_CMQ_DESC_NUM; @@ -337,18 +341,20 @@ int hclge_cmd_init(struct hclge_dev *hdev) u32 version; int ret; + spin_lock_bh(&hdev->hw.cmq.csq.lock); + spin_lock_bh(&hdev->hw.cmq.crq.lock); + hdev->hw.cmq.csq.next_to_clean = 0; hdev->hw.cmq.csq.next_to_use = 0; hdev->hw.cmq.crq.next_to_clean = 0; hdev->hw.cmq.crq.next_to_use = 0; - /* Setup the lock for command queue */ - spin_lock_init(&hdev->hw.cmq.csq.lock); - spin_lock_init(&hdev->hw.cmq.crq.lock); - hclge_cmd_init_regs(&hdev->hw); clear_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state); + spin_unlock_bh(&hdev->hw.cmq.crq.lock); + spin_unlock_bh(&hdev->hw.cmq.csq.lock); + ret = hclge_cmd_query_firmware_version(&hdev->hw, &version); if (ret) { dev_err(&hdev->pdev->dev, From 5faaf0752a0c43735b7c508dcf3c4c7b36a032db Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:47 +0800 Subject: [PATCH 31/49] net: hns3: remove unnecessary queue reset in the hns3_uninit_all_ring() It is not necessary to reset the queue in the hns3_uninit_all_ring(), since the queue is stopped in the down operation, and will be reset in the up operation. 
And the judgment of the HCLGE_STATE_RST_HANDLING flag in the hclge_reset_tqp() is not correct, because we need to reset tqp during pf reset, otherwise it may cause queue not being reset to working state problem. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 --- 2 files changed, 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b767ff96b537..bf71c23be409 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3250,9 +3250,6 @@ int hns3_uninit_all_ring(struct hns3_nic_priv *priv) int i; for (i = 0; i < h->kinfo.num_tqps; i++) { - if (h->ae_algo->ops->reset_queue) - h->ae_algo->ops->reset_queue(h, i); - hns3_fini_ring(priv->ring_data[i].ring); hns3_fini_ring(priv->ring_data[i + h->kinfo.num_tqps].ring); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 2a6314784c8a..4dd050688549 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6116,9 +6116,6 @@ void hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id) u16 queue_gid; int ret; - if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) - return; - queue_gid = hclge_covert_handle_qid_global(handle, queue_id); ret = hclge_tqp_enable(hdev, queue_id, 0, false); From 6d71ec6cbf74ac9c2823ef751b1baa5b889bb3ac Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:48 +0800 Subject: [PATCH 32/49] net: hns3: bugfix for is_valid_csq_clean_head() The HEAD pointer of the hardware command queue maybe equal to the command queue's next_to_use in the driver, so that does not belong to the 
invalid HEAD pointer, since the hardware may not process the command in time, causing the HEAD pointer to be too late to update. The variables' name in this function is unreadable, so give them a more readable one. Fixes: 3ff504908f95 ("net: hns3: fix a dead loop in hclge_cmd_csq_clean") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 68026a5ad7e7..690f62ed87dc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -24,15 +24,15 @@ static int hclge_ring_space(struct hclge_cmq_ring *ring) return ring->desc_num - used - 1; } -static int is_valid_csq_clean_head(struct hclge_cmq_ring *ring, int h) +static int is_valid_csq_clean_head(struct hclge_cmq_ring *ring, int head) { - int u = ring->next_to_use; - int c = ring->next_to_clean; + int ntu = ring->next_to_use; + int ntc = ring->next_to_clean; - if (unlikely(h >= ring->desc_num)) - return 0; + if (ntu > ntc) + return head >= ntc && head <= ntu; - return u > c ? (h > c && h <= u) : (h > c || h <= u); + return head >= ntc || head <= ntu; } static int hclge_alloc_cmd_desc(struct hclge_cmq_ring *ring) From 1c12493809924deda6c0834cb2f2c5a6dc786390 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:49 +0800 Subject: [PATCH 33/49] net: hns3: bugfix for hclge_mdio_write and hclge_mdio_read When there is a PHY, the driver needs to complete some operations through MDIO during reset reinitialization, so HCLGE_STATE_CMD_DISABLE is more suitable than HCLGE_STATE_RST_HANDLING to prevent the MDIO operation from being sent during the hardware reset. 
Fixes: b50ae26c57cb ("net: hns3: never send command queue message to IMP when reset") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c index 24b1f2a0c32a..03018638f701 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c @@ -52,7 +52,7 @@ static int hclge_mdio_write(struct mii_bus *bus, int phyid, int regnum, struct hclge_desc desc; int ret; - if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) + if (test_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state)) return 0; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MDIO_CONFIG, false); @@ -90,7 +90,7 @@ static int hclge_mdio_read(struct mii_bus *bus, int phyid, int regnum) struct hclge_desc desc; int ret; - if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) + if (test_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state)) return 0; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MDIO_CONFIG, true); From 7fa6be4fd2f60399f3f3370a87629e102407a724 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:50 +0800 Subject: [PATCH 34/49] net: hns3: fix incorrect return value/type of some functions There are some functions that, when they fail to send the command, need to return the corresponding error value to their callers. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Fixes: 681ec3999b3d ("net: hns3: fix for vlan table lost problem when resetting") Signed-off-by: Huazhong Tan Signed-off-by: David S.
Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 6 +- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 80 +++++++++++++------ .../net/ethernet/hisilicon/hns3/hns3_enet.h | 2 +- .../hisilicon/hns3/hns3pf/hclge_main.c | 34 ++++---- .../hisilicon/hns3/hns3pf/hclge_main.h | 2 +- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 14 ++-- 6 files changed, 85 insertions(+), 53 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index e82e4ca20620..055b40606dbc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -316,8 +316,8 @@ struct hnae3_ae_ops { int (*set_loopback)(struct hnae3_handle *handle, enum hnae3_loop loop_mode, bool en); - void (*set_promisc_mode)(struct hnae3_handle *handle, bool en_uc_pmc, - bool en_mc_pmc); + int (*set_promisc_mode)(struct hnae3_handle *handle, bool en_uc_pmc, + bool en_mc_pmc); int (*set_mtu)(struct hnae3_handle *handle, int new_mtu); void (*get_pauseparam)(struct hnae3_handle *handle, @@ -391,7 +391,7 @@ struct hnae3_ae_ops { int vector_num, struct hnae3_ring_chain_node *vr_chain); - void (*reset_queue)(struct hnae3_handle *handle, u16 queue_id); + int (*reset_queue)(struct hnae3_handle *handle, u16 queue_id); u32 (*get_fw_version)(struct hnae3_handle *handle); void (*get_mdix_mode)(struct hnae3_handle *handle, u8 *tp_mdix_ctrl, u8 *tp_mdix); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index bf71c23be409..3f96aa30068e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -509,16 +509,18 @@ static void hns3_nic_set_rx_mode(struct net_device *netdev) h->netdev_flags = new_flags; } -void hns3_update_promisc_mode(struct net_device *netdev, u8 promisc_flags) +int hns3_update_promisc_mode(struct net_device *netdev, u8 promisc_flags) { struct hns3_nic_priv *priv = netdev_priv(netdev); struct 
hnae3_handle *h = priv->ae_handle; if (h->ae_algo->ops->set_promisc_mode) { - h->ae_algo->ops->set_promisc_mode(h, - promisc_flags & HNAE3_UPE, - promisc_flags & HNAE3_MPE); + return h->ae_algo->ops->set_promisc_mode(h, + promisc_flags & HNAE3_UPE, + promisc_flags & HNAE3_MPE); } + + return 0; } void hns3_enable_vlan_filter(struct net_device *netdev, bool enable) @@ -1494,18 +1496,22 @@ static int hns3_vlan_rx_kill_vid(struct net_device *netdev, return ret; } -static void hns3_restore_vlan(struct net_device *netdev) +static int hns3_restore_vlan(struct net_device *netdev) { struct hns3_nic_priv *priv = netdev_priv(netdev); + int ret = 0; u16 vid; - int ret; for_each_set_bit(vid, priv->active_vlans, VLAN_N_VID) { ret = hns3_vlan_rx_add_vid(netdev, htons(ETH_P_8021Q), vid); - if (ret) - netdev_warn(netdev, "Restore vlan: %d filter, ret:%d\n", - vid, ret); + if (ret) { + netdev_err(netdev, "Restore vlan: %d filter, ret:%d\n", + vid, ret); + return ret; + } } + + return ret; } static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, @@ -3257,11 +3263,12 @@ int hns3_uninit_all_ring(struct hns3_nic_priv *priv) } /* Set mac addr if it is configured. 
or leave it to the AE driver */ -static void hns3_init_mac_addr(struct net_device *netdev, bool init) +static int hns3_init_mac_addr(struct net_device *netdev, bool init) { struct hns3_nic_priv *priv = netdev_priv(netdev); struct hnae3_handle *h = priv->ae_handle; u8 mac_addr_temp[ETH_ALEN]; + int ret = 0; if (h->ae_algo->ops->get_mac_addr && init) { h->ae_algo->ops->get_mac_addr(h, mac_addr_temp); @@ -3276,8 +3283,9 @@ static void hns3_init_mac_addr(struct net_device *netdev, bool init) } if (h->ae_algo->ops->set_mac_addr) - h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true); + ret = h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true); + return ret; } static int hns3_restore_fd_rules(struct net_device *netdev) @@ -3490,20 +3498,29 @@ err_out: return ret; } -static void hns3_recover_hw_addr(struct net_device *ndev) +static int hns3_recover_hw_addr(struct net_device *ndev) { struct netdev_hw_addr_list *list; struct netdev_hw_addr *ha, *tmp; + int ret = 0; /* go through and sync uc_addr entries to the device */ list = &ndev->uc; - list_for_each_entry_safe(ha, tmp, &list->list, list) - hns3_nic_uc_sync(ndev, ha->addr); + list_for_each_entry_safe(ha, tmp, &list->list, list) { + ret = hns3_nic_uc_sync(ndev, ha->addr); + if (ret) + return ret; + } /* go through and sync mc_addr entries to the device */ list = &ndev->mc; - list_for_each_entry_safe(ha, tmp, &list->list, list) - hns3_nic_mc_sync(ndev, ha->addr); + list_for_each_entry_safe(ha, tmp, &list->list, list) { + ret = hns3_nic_mc_sync(ndev, ha->addr); + if (ret) + return ret; + } + + return ret; } static void hns3_remove_hw_addr(struct net_device *netdev) @@ -3630,7 +3647,10 @@ int hns3_nic_reset_all_ring(struct hnae3_handle *h) int ret; for (i = 0; i < h->kinfo.num_tqps; i++) { - h->ae_algo->ops->reset_queue(h, i); + ret = h->ae_algo->ops->reset_queue(h, i); + if (ret) + return ret; + hns3_init_ring_hw(priv->ring_data[i].ring); /* We need to clear tx ring here because self test will @@ -3722,18 +3742,30 
@@ static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) bool vlan_filter_enable; int ret; - hns3_init_mac_addr(netdev, false); - hns3_recover_hw_addr(netdev); - hns3_update_promisc_mode(netdev, handle->netdev_flags); + ret = hns3_init_mac_addr(netdev, false); + if (ret) + return ret; + + ret = hns3_recover_hw_addr(netdev); + if (ret) + return ret; + + ret = hns3_update_promisc_mode(netdev, handle->netdev_flags); + if (ret) + return ret; + vlan_filter_enable = netdev->flags & IFF_PROMISC ? false : true; hns3_enable_vlan_filter(netdev, vlan_filter_enable); - /* Hardware table is only clear when pf resets */ - if (!(handle->flags & HNAE3_SUPPORT_VF)) - hns3_restore_vlan(netdev); + if (!(handle->flags & HNAE3_SUPPORT_VF)) { + ret = hns3_restore_vlan(netdev); + return ret; + } - hns3_restore_fd_rules(netdev); + ret = hns3_restore_fd_rules(netdev); + if (ret) + return ret; /* Carrier off reporting is important to ethtool even BEFORE open */ netif_carrier_off(netdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 71cfca132d0b..d3636d088aa3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -640,7 +640,7 @@ void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, u32 rl_value); void hns3_enable_vlan_filter(struct net_device *netdev, bool enable); -void hns3_update_promisc_mode(struct net_device *netdev, u8 promisc_flags); +int hns3_update_promisc_mode(struct net_device *netdev, u8 promisc_flags); #ifdef CONFIG_HNS3_DCB void hns3_dcbnl_setup(struct hnae3_handle *handle); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 4dd050688549..f3212c96bb64 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3314,8 +3314,8 @@ void 
hclge_promisc_param_init(struct hclge_promisc_param *param, bool en_uc, param->vf_id = vport_id; } -static void hclge_set_promisc_mode(struct hnae3_handle *handle, bool en_uc_pmc, - bool en_mc_pmc) +static int hclge_set_promisc_mode(struct hnae3_handle *handle, bool en_uc_pmc, + bool en_mc_pmc) { struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; @@ -3323,7 +3323,7 @@ static void hclge_set_promisc_mode(struct hnae3_handle *handle, bool en_uc_pmc, hclge_promisc_param_init(&param, en_uc_pmc, en_mc_pmc, true, vport->vport_id); - hclge_cmd_set_promisc_mode(hdev, &param); + return hclge_cmd_set_promisc_mode(hdev, &param); } static int hclge_get_fd_mode(struct hclge_dev *hdev, u8 *fd_mode) @@ -6107,28 +6107,28 @@ static u16 hclge_covert_handle_qid_global(struct hnae3_handle *handle, return tqp->index; } -void hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id) +int hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id) { struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; int reset_try_times = 0; int reset_status; u16 queue_gid; - int ret; + int ret = 0; queue_gid = hclge_covert_handle_qid_global(handle, queue_id); ret = hclge_tqp_enable(hdev, queue_id, 0, false); if (ret) { - dev_warn(&hdev->pdev->dev, "Disable tqp fail, ret = %d\n", ret); - return; + dev_err(&hdev->pdev->dev, "Disable tqp fail, ret = %d\n", ret); + return ret; } ret = hclge_send_reset_tqp_cmd(hdev, queue_gid, true); if (ret) { - dev_warn(&hdev->pdev->dev, - "Send reset tqp cmd fail, ret = %d\n", ret); - return; + dev_err(&hdev->pdev->dev, + "Send reset tqp cmd fail, ret = %d\n", ret); + return ret; } reset_try_times = 0; @@ -6141,16 +6141,16 @@ void hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id) } if (reset_try_times >= HCLGE_TQP_RESET_TRY_TIMES) { - dev_warn(&hdev->pdev->dev, "Reset TQP fail\n"); - return; + dev_err(&hdev->pdev->dev, "Reset TQP fail\n"); + return ret; } ret = hclge_send_reset_tqp_cmd(hdev,
queue_gid, false); - if (ret) { - dev_warn(&hdev->pdev->dev, - "Deassert the soft reset fail, ret = %d\n", ret); - return; - } + if (ret) + dev_err(&hdev->pdev->dev, + "Deassert the soft reset fail, ret = %d\n", ret); + + return ret; } void hclge_reset_vf_queue(struct hclge_vport *vport, u16 queue_id) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index e3dfd654eca9..0d9215404269 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -778,7 +778,7 @@ int hclge_rss_init_hw(struct hclge_dev *hdev); void hclge_rss_indir_init_cfg(struct hclge_dev *hdev); void hclge_mbx_handler(struct hclge_dev *hdev); -void hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id); +int hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id); void hclge_reset_vf_queue(struct hclge_vport *vport, u16 queue_id); int hclge_cfg_flowctrl(struct hclge_dev *hdev); int hclge_func_reset_cmd(struct hclge_dev *hdev, int func_id); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index e0a86a58342c..b224f6a34030 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -925,12 +925,12 @@ static int hclgevf_cmd_set_promisc_mode(struct hclgevf_dev *hdev, return status; } -static void hclgevf_set_promisc_mode(struct hnae3_handle *handle, - bool en_uc_pmc, bool en_mc_pmc) +static int hclgevf_set_promisc_mode(struct hnae3_handle *handle, + bool en_uc_pmc, bool en_mc_pmc) { struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); - hclgevf_cmd_set_promisc_mode(hdev, en_uc_pmc, en_mc_pmc); + return hclgevf_cmd_set_promisc_mode(hdev, en_uc_pmc, en_mc_pmc); } static int hclgevf_tqp_enable(struct hclgevf_dev *hdev, int tqp_id, @@ -1080,7 +1080,7 @@ static int 
hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) 1, false, NULL, 0); } -static void hclgevf_reset_tqp(struct hnae3_handle *handle, u16 queue_id) +static int hclgevf_reset_tqp(struct hnae3_handle *handle, u16 queue_id) { struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); u8 msg_data[2]; @@ -1091,10 +1091,10 @@ static void hclgevf_reset_tqp(struct hnae3_handle *handle, u16 queue_id) /* disable vf queue before send queue reset msg to PF */ ret = hclgevf_tqp_enable(hdev, queue_id, 0, false); if (ret) - return; + return ret; - hclgevf_send_mbx_msg(hdev, HCLGE_MBX_QUEUE_RESET, 0, msg_data, - 2, true, NULL, 0); + return hclgevf_send_mbx_msg(hdev, HCLGE_MBX_QUEUE_RESET, 0, msg_data, + 2, true, NULL, 0); } static int hclgevf_notify_client(struct hclgevf_dev *hdev, From 3c88ed1d798da355859ca083d3884a16ce0841f2 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:51 +0800 Subject: [PATCH 35/49] net: hns3: bugfix for handling mailbox while the command queue reinitialized In a multi-core machine, the mailbox service and reset service will be executed at the same time. The reset service will re-initialize the command queue, before that, the mailbox handler can only get some invalid messages. The HCLGE_STATE_CMD_DISABLE flag means that the command queue is not available and needs to be reinitialized. Therefore, when the mailbox handler recognizes this flag, it should not process the command. Fixes: dde1a86e93ca ("net: hns3: Add mailbox support to PF driver") Signed-off-by: Huazhong Tan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 04462a347a94..f890022938d9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -400,6 +400,12 @@ void hclge_mbx_handler(struct hclge_dev *hdev) /* handle all the mailbox requests in the queue */ while (!hclge_cmd_crq_empty(&hdev->hw)) { + if (test_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state)) { + dev_warn(&hdev->pdev->dev, + "command queue needs re-initializing\n"); + return; + } + desc = &crq->desc[crq->next_to_use]; req = (struct hclge_mbx_vf_to_pf_cmd *)desc->data; From a963052e539887df481d4d3a6ad4c92ca6461852 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:52 +0800 Subject: [PATCH 36/49] net: hns3: bugfix for rtnl_lock's range in the hclge_reset() Since hclge_reset_wait() is used to wait for the hardware to complete the reset, it is not necessary to hold the rtnl_lock during hclge_reset_wait(). So this patch releases the lock for the duration of hclge_reset_wait(). Fixes: 6d4fab39533f ("net: hns3: Reset net device with rtnl_lock") Signed-off-by: Huazhong Tan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index f3212c96bb64..ffdd96020860 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2470,14 +2470,17 @@ static void hclge_reset(struct hclge_dev *hdev) handle = &hdev->vport[0].nic; rtnl_lock(); hclge_notify_client(hdev, HNAE3_DOWN_CLIENT); + rtnl_unlock(); if (!hclge_reset_wait(hdev)) { + rtnl_lock(); hclge_notify_client(hdev, HNAE3_UNINIT_CLIENT); hclge_reset_ae_dev(hdev->ae_dev); hclge_notify_client(hdev, HNAE3_INIT_CLIENT); hclge_clear_reset_cause(hdev); } else { + rtnl_lock(); /* schedule again to check pending resets later */ set_bit(hdev->reset_type, &hdev->reset_pending); hclge_reset_task_schedule(hdev); From 29118ab962d5476fdc65fae312ac38db68092d78 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Tue, 30 Oct 2018 21:50:53 +0800 Subject: [PATCH 37/49] net: hns3: bugfix for rtnl_lock's range in the hclgevf_reset() Since hclgevf_reset_wait() is used to wait for the hardware to complete the reset, it is not necessary to hold the rtnl_lock during hclgevf_reset_wait(). So this patch releases the lock for the duration of hclgevf_reset_wait(). Fixes: 6988eb2a9b77 ("net: hns3: Add support to reset the enet/ring mgmt layer") Signed-off-by: Huazhong Tan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index b224f6a34030..085edb945389 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1170,6 +1170,8 @@ static int hclgevf_reset(struct hclgevf_dev *hdev) /* bring down the nic to stop any ongoing TX/RX */ hclgevf_notify_client(hdev, HNAE3_DOWN_CLIENT); + rtnl_unlock(); + /* check if VF could successfully fetch the hardware reset completion * status from the hardware */ @@ -1181,12 +1183,15 @@ static int hclgevf_reset(struct hclgevf_dev *hdev) ret); dev_warn(&hdev->pdev->dev, "VF reset failed, disabling VF!\n"); + rtnl_lock(); hclgevf_notify_client(hdev, HNAE3_UNINIT_CLIENT); rtnl_unlock(); return ret; } + rtnl_lock(); + /* now, re-initialize the nic client and ae device*/ ret = hclgevf_reset_stack(hdev); if (ret) From 8b931821aa04823e2e5df0ae93937baabbd23286 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 31 Oct 2018 09:56:42 +0000 Subject: [PATCH 38/49] mlxsw: reg: QEEC: Add minimum shaper fields Add QEEC.mise (minimum shaper enable) and QEEC.min_shaper_rate to enable configuration of minimum shaper. Increase the QEEC length to 0x20 as well: that's the length that the register has had for a long time now, but with the configurations that mlxsw typically exercises, the firmware tolerated 0x1C-sized packets. With mise=true however, FW rejects packets unless they have the full required length. Fixes: b9b7cee40579 ("mlxsw: reg: Add QoS ETS Element Configuration register") Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 32cb6718bb17..db3d2790aeec 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -3284,7 +3284,7 @@ static inline void mlxsw_reg_qtct_pack(char *payload, u8 local_port, * Configures the ETS elements. */ #define MLXSW_REG_QEEC_ID 0x400D -#define MLXSW_REG_QEEC_LEN 0x1C +#define MLXSW_REG_QEEC_LEN 0x20 MLXSW_REG_DEFINE(qeec, MLXSW_REG_QEEC_ID, MLXSW_REG_QEEC_LEN); @@ -3326,6 +3326,15 @@ MLXSW_ITEM32(reg, qeec, element_index, 0x04, 0, 8); */ MLXSW_ITEM32(reg, qeec, next_element_index, 0x08, 0, 8); +/* reg_qeec_mise + * Min shaper configuration enable. Enables configuration of the min + * shaper on this ETS element + * 0 - Disable + * 1 - Enable + * Access: RW + */ +MLXSW_ITEM32(reg, qeec, mise, 0x0C, 31, 1); + enum { MLXSW_REG_QEEC_BYTES_MODE, MLXSW_REG_QEEC_PACKETS_MODE, @@ -3342,6 +3351,17 @@ enum { */ MLXSW_ITEM32(reg, qeec, pb, 0x0C, 28, 1); +/* The smallest permitted min shaper rate. */ +#define MLXSW_REG_QEEC_MIS_MIN 200000 /* Kbps */ + +/* reg_qeec_min_shaper_rate + * Min shaper information rate. + * For CPU port, can only be configured for port hierarchy. + * When in bytes mode, value is specified in units of 1000bps. + * Access: RW + */ +MLXSW_ITEM32(reg, qeec, min_shaper_rate, 0x0C, 0, 28); + /* reg_qeec_mase * Max shaper configuration enable. Enables configuration of the max * shaper on this ETS element. From 0fe64023162aef123de2f1993ba13a35a786e1de Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 31 Oct 2018 09:56:44 +0000 Subject: [PATCH 39/49] mlxsw: spectrum: Set minimum shaper on MC TCs An MC-aware mode was introduced in commit 7b8195306694 ("mlxsw: spectrum: Configure MC-aware mode on mlxsw ports"). 
In MC-aware mode, BUM traffic gets a special treatment by being assigned to a separate set of traffic classes 8..15. Pairs of TCs 0 and 8, 1 and 9, etc., are then configured to strictly prioritize the lower-numbered ones. The intention is to prevent BUM traffic from flooding the switch and pushing out all UC traffic, which would otherwise happen, and instead give UC traffic precedence. However, strictly prioritizing UC traffic has the effect that UC overload pushes out all BUM traffic, such as legitimate ARP queries. These packets are kept in queues for a while, but under sustained UC overload, their lifetime eventually expires and these packets are dropped. That is detrimental to network performance as well. Therefore, configure the MC TCs (8..15) with a minimum shaper of 200Mbps (the minimum permitted value) to allow a trickle of necessary control traffic to get through. Fixes: 7b8195306694 ("mlxsw: spectrum: Configure MC-aware mode on mlxsw ports") Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S.
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 8a4983adae94..a2df12b79f8e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2740,6 +2740,21 @@ int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port, return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl); } +static int mlxsw_sp_port_min_bw_set(struct mlxsw_sp_port *mlxsw_sp_port, + enum mlxsw_reg_qeec_hr hr, u8 index, + u8 next_index, u32 minrate) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char qeec_pl[MLXSW_REG_QEEC_LEN]; + + mlxsw_reg_qeec_pack(qeec_pl, mlxsw_sp_port->local_port, hr, index, + next_index); + mlxsw_reg_qeec_mise_set(qeec_pl, true); + mlxsw_reg_qeec_min_shaper_rate_set(qeec_pl, minrate); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl); +} + int mlxsw_sp_port_prio_tc_set(struct mlxsw_sp_port *mlxsw_sp_port, u8 switch_prio, u8 tclass) { @@ -2817,6 +2832,16 @@ static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port) return err; } + /* Configure the min shaper for multicast TCs. */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_min_bw_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_TC, + i + 8, i, + MLXSW_REG_QEEC_MIS_MIN); + if (err) + return err; + } + /* Map all priorities to traffic class 0. */ for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { err = mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i, 0); From 8f3f09358c81248109463b3cae254b7db4ea9af0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 31 Oct 2018 09:56:45 +0000 Subject: [PATCH 40/49] selftests: mlxsw: qos_mc_aware: Tweak for min shaper Since the minimum shaper is now being enabled for MC TCs, it's unreasonable to expect no UC traffic loss. 
Minimal min shaper value is 200Mbps, which is 20% of the 1Gbps that this test configures on egress. To cover for glitches, tolerate up to 25% UC degradation under MC overload. Fixes: b5638d46c90a ("selftests: mlxsw: Add a test for UC behavior under MC flood") Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh index 0150bb2741eb..a8fc36d670e1 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh @@ -311,7 +311,7 @@ test_mc_aware() ret = 100 * ($ucth1 - $ucth2) / $ucth1 if (ret > 0) { ret } else { 0 } ") - check_err $(bc <<< "$deg > 10") + check_err $(bc <<< "$deg > 25") local interval=$((d1 - d0)) local mc_ir=$(rate $u0 $u1 $interval) From a5ee171d087ee632a5f190bb3ce1c0f98e06ec0a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 31 Oct 2018 09:56:46 +0000 Subject: [PATCH 41/49] selftests: mlxsw: qos_mc_aware: Add a test for UC awareness In a previous patch, mlxsw was updated to configure a minimum bandwidth allowance on MC TCs. Test that this indeed fixes the problem of UC traffic overload pushing out all MC traffic. Fixes: b5638d46c90a ("selftests: mlxsw: Add a test for UC behavior under MC flood") Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- .../drivers/net/mlxsw/qos_mc_aware.sh | 93 ++++++++++++++----- 1 file changed, 70 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh index a8fc36d670e1..117f6f35d72f 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh @@ -25,24 +25,24 @@ # Thus we set MTU to 10K on all involved interfaces. Then both unicast and # multicast traffic uses 8K frames. # -# +-----------------------+ +----------------------------------+ -# | H1 | | H2 | -# | | | unicast --> + $h2.111 | -# | | | traffic | 192.0.2.129/28 | -# | multicast | | | e-qos-map 0:1 | -# | traffic | | | | -# | $h1 + <----- | | + $h2 | -# +-----|-----------------+ +--------------|-------------------+ -# | | -# +-----|-------------------------------------------------|-------------------+ -# | + $swp1 + $swp2 | -# | | >1Gbps | >1Gbps | -# | +---|----------------+ +----------|----------------+ | -# | | + $swp1.1 | | + $swp2.111 | | +# +---------------------------+ +----------------------------------+ +# | H1 | | H2 | +# | | | unicast --> + $h2.111 | +# | multicast | | traffic | 192.0.2.129/28 | +# | traffic | | | e-qos-map 0:1 | +# | $h1 + <----- | | | | +# | 192.0.2.65/28 | | | + $h2 | +# +---------------|-----------+ +--------------|-------------------+ +# | | +# +---------------|---------------------------------------|-------------------+ +# | $swp1 + + $swp2 | +# | >1Gbps | | >1Gbps | +# | +-------------|------+ +----------|----------------+ | +# | | $swp1.1 + | | + $swp2.111 | | # | | BR1 | SW | BR111 | | -# | | + $swp3.1 | | + $swp3.111 | | -# | +---|----------------+ +----------|----------------+ | -# | \_________________________________________________/ | +# | | $swp3.1 + | | + $swp3.111 | | +# | +-------------|------+ +----------|----------------+ | +# | \_______________________________________/ | # | | | # | 
+ $swp3 | # | | 1Gbps bottleneck | @@ -51,6 +51,7 @@ # | # +--|-----------------+ # | + $h3 H3 | +# | | 192.0.2.66/28 | # | | | # | + $h3.111 | # | 192.0.2.130/28 | @@ -59,6 +60,7 @@ ALL_TESTS=" ping_ipv4 test_mc_aware + test_uc_aware " lib_dir=$(dirname $0)/../../../net/forwarding @@ -68,14 +70,14 @@ source $lib_dir/lib.sh h1_create() { - simple_if_init $h1 + simple_if_init $h1 192.0.2.65/28 mtu_set $h1 10000 } h1_destroy() { mtu_restore $h1 - simple_if_fini $h1 + simple_if_fini $h1 192.0.2.65/28 } h2_create() @@ -97,7 +99,7 @@ h2_destroy() h3_create() { - simple_if_init $h3 + simple_if_init $h3 192.0.2.66/28 mtu_set $h3 10000 vlan_create $h3 111 v$h3 192.0.2.130/28 @@ -108,7 +110,7 @@ h3_destroy() vlan_destroy $h3 111 mtu_restore $h3 - simple_if_fini $h3 + simple_if_fini $h3 192.0.2.66/28 } switch_create() @@ -251,7 +253,7 @@ measure_uc_rate() # average ingress rate to somewhat mitigate this. local min_ingress=2147483648 - mausezahn $h2.111 -p 8000 -A 192.0.2.129 -B 192.0.2.130 -c 0 \ + $MZ $h2.111 -p 8000 -A 192.0.2.129 -B 192.0.2.130 -c 0 \ -a own -b $h3mac -t udp -q & sleep 1 @@ -291,7 +293,7 @@ test_mc_aware() check_err $? 
"Could not get high enough UC-only ingress rate" local ucth1=${uc_rate[1]} - mausezahn $h1 -p 8000 -c 0 -a own -b bc -t udp -q & + $MZ $h1 -p 8000 -c 0 -a own -b bc -t udp -q & local d0=$(date +%s) local t0=$(ethtool_stats_get $h3 rx_octets_prio_0) @@ -335,6 +337,51 @@ test_mc_aware() echo " egress UC throughput $(humanize ${uc_rate_2[1]})" echo " ingress MC throughput $(humanize $mc_ir)" echo " egress MC throughput $(humanize $mc_er)" + echo +} + +test_uc_aware() +{ + RET=0 + + $MZ $h2.111 -p 8000 -A 192.0.2.129 -B 192.0.2.130 -c 0 \ + -a own -b $h3mac -t udp -q & + + local d0=$(date +%s) + local t0=$(ethtool_stats_get $h3 rx_octets_prio_1) + local u0=$(ethtool_stats_get $swp2 rx_octets_prio_1) + sleep 1 + + local attempts=50 + local passes=0 + local i + + for ((i = 0; i < attempts; ++i)); do + if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 0.1; then + ((passes++)) + fi + + sleep 0.1 + done + + local d1=$(date +%s) + local t1=$(ethtool_stats_get $h3 rx_octets_prio_1) + local u1=$(ethtool_stats_get $swp2 rx_octets_prio_1) + + local interval=$((d1 - d0)) + local uc_ir=$(rate $u0 $u1 $interval) + local uc_er=$(rate $t0 $t1 $interval) + + ((attempts == passes)) + check_err $? + + # Suppress noise from killing mausezahn. + { kill %% && wait; } 2>/dev/null + + log_test "MC performace under UC overload" + echo " ingress UC throughput $(humanize ${uc_ir})" + echo " egress UC throughput $(humanize ${uc_er})" + echo " sent $attempts BC ARPs, got $passes responses" } trap cleanup EXIT From deee2cae27d1914850195e3fb219cc611e953560 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 30 Oct 2018 11:15:55 +0800 Subject: [PATCH 42/49] kselftests/bpf: use ping6 as the default ipv6 ping binary if it exists ping binary on some distros doesn't support "ping -6" anymore. 
Signed-off-by: Li Zhijian Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_skb_cgroup_id.sh | 3 ++- tools/testing/selftests/bpf/test_sock_addr.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh index 42544a969abc..a9bc6f82abc1 100755 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -10,7 +10,7 @@ wait_for_ip() echo -n "Wait for testing link-local IP to become available " for _i in $(seq ${MAX_PING_TRIES}); do echo -n "." - if ping -6 -q -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then + if $PING6 -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then echo " OK" return fi @@ -58,5 +58,6 @@ BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o" BPF_PROG_SECTION="cgroup_id_logger" BPF_PROG_ID=0 PROG="${DIR}/test_skb_cgroup_id_user" +type ping6 >/dev/null 2>&1 && PING6="ping6" || PING6="ping -6" main diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh index 9832a875a828..3b9fdb8094aa 100755 --- a/tools/testing/selftests/bpf/test_sock_addr.sh +++ b/tools/testing/selftests/bpf/test_sock_addr.sh @@ -4,7 +4,8 @@ set -eu ping_once() { - ping -${1} -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1 + type ping${1} >/dev/null 2>&1 && PING="ping${1}" || PING="ping -${1}" + $PING -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1 } wait_for_ip() From 3615353218744bb60f55170c620ce4dce1a008c7 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 31 Oct 2018 12:57:18 -0700 Subject: [PATCH 43/49] libbpf: Fix compile error in libbpf_attach_type_by_name Arnaldo Carvalho de Melo reported build error in libbpf when clang version 3.8.1-24 (tags/RELEASE_381/final) is used: libbpf.c:2201:36: error: comparison of constant -22 with expression of type 'const enum bpf_attach_type' is always false 
[-Werror,-Wtautological-constant-out-of-range-compare] if (section_names[i].attach_type == -EINVAL) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~ 1 error generated. Fix the error by keeping "is_attachable" property of a program in a separate struct field instead of trying to use attach_type itself. Fixes: 956b620fcf0b ("libbpf: Introduce libbpf_attach_type_by_name") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Andrey Ignatov Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b607be7236d3..d6e62e90e8d4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2084,19 +2084,19 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog, prog->expected_attach_type = type; } -#define BPF_PROG_SEC_IMPL(string, ptype, eatype, atype) \ - { string, sizeof(string) - 1, ptype, eatype, atype } +#define BPF_PROG_SEC_IMPL(string, ptype, eatype, is_attachable, atype) \ + { string, sizeof(string) - 1, ptype, eatype, is_attachable, atype } /* Programs that can NOT be attached. */ -#define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_IMPL(string, ptype, 0, -EINVAL) +#define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_IMPL(string, ptype, 0, 0, 0) /* Programs that can be attached. */ #define BPF_APROG_SEC(string, ptype, atype) \ - BPF_PROG_SEC_IMPL(string, ptype, 0, atype) + BPF_PROG_SEC_IMPL(string, ptype, 0, 1, atype) /* Programs that must specify expected attach type at load time. */ #define BPF_EAPROG_SEC(string, ptype, eatype) \ - BPF_PROG_SEC_IMPL(string, ptype, eatype, eatype) + BPF_PROG_SEC_IMPL(string, ptype, eatype, 1, eatype) /* Programs that can be attached but attach type can't be identified by section * name. Kept for backward compatibility. 
@@ -2108,6 +2108,7 @@ static const struct { size_t len; enum bpf_prog_type prog_type; enum bpf_attach_type expected_attach_type; + int is_attachable; enum bpf_attach_type attach_type; } section_names[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), @@ -2198,7 +2199,7 @@ int libbpf_attach_type_by_name(const char *name, for (i = 0; i < ARRAY_SIZE(section_names); i++) { if (strncmp(name, section_names[i].sec, section_names[i].len)) continue; - if (section_names[i].attach_type == -EINVAL) + if (!section_names[i].is_attachable) return -EINVAL; *attach_type = section_names[i].attach_type; return 0; From 0962590e553331db2cc0aef2dc35c57f6300dbbe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:52 +0100 Subject: [PATCH 44/49] bpf: fix partial copy of map_ptr when dst is scalar ALU operations on pointers such as scalar_reg += map_value_ptr are handled in adjust_ptr_min_max_vals(). The problem, however, is that map_ptr and range in the register state share a union, so transferring state through dst_reg->range = ptr_reg->range is buggy: any new map_ptr in the dst_reg is then truncated (or null) for subsequent checks. Fix this by adding a raw member to the union and using it for copying state over to dst_reg. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Daniel Borkmann Cc: Edward Cree Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ kernel/bpf/verifier.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9e8056ec20fa..d93e89761a8b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -51,6 +51,9 @@ struct bpf_reg_state { * PTR_TO_MAP_VALUE_OR_NULL */ struct bpf_map *map_ptr; + + /* Max size from any of the above.
*/ + unsigned long raw; }; /* Fixed part of pointer offset, pointer types only */ s32 off; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 171a2c88e77d..774fa40a32ae 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3046,7 +3046,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; dst_reg->off = ptr_reg->off + smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. Note that off_reg->off @@ -3076,10 +3076,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_SUB: @@ -3108,7 +3109,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = ptr_reg->var_off; dst_reg->id = ptr_reg->id; dst_reg->off = ptr_reg->off - smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. 
If the subtrahend is known @@ -3134,11 +3135,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_AND: From 4d31f30148cea6e97e42616231eed55295117fe7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:53 +0100 Subject: [PATCH 45/49] bpf: don't set id on after map lookup with ptr_to_map_val return In the verifier there are no semantics under which registers of type PTR_TO_MAP_VALUE have an id assigned to them. The id is only used for PTR_TO_MAP_VALUE_OR_NULL and is later on nullified once the test against NULL has been pattern matched and the type transformed into PTR_TO_MAP_VALUE. Fixes: 3e6a4b3e0289 ("bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE") Signed-off-by: Daniel Borkmann Cc: Roman Gushchin Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 774fa40a32ae..1971ca325fb4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2852,10 +2852,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || fn->ret_type == RET_PTR_TO_MAP_VALUE) { - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - else - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -2868,7 +2864,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int
insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].id = ++env->id_gen; + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + } else { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { int id = acquire_reference_state(env, insn_idx); if (id < 0) From 2683f4128c8730699296827d3209d2de80fa1d6c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:54 +0100 Subject: [PATCH 46/49] bpf: add various test cases to test_verifier Add some more map related test cases to test_verifier kselftest to improve test coverage. Summary: 1012 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 250 ++++++++++++++++++++ 1 file changed, 250 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 36f3d3009d1a..4c7445d4b3e6 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -6454,6 +6454,256 @@ static struct bpf_test tests[] = { .errstr = "R1 min value is negative", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "map access: known scalar += value_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: value_ptr += known scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, 
-8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: unknown scalar += value_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: value_ptr += unknown scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: value_ptr += value_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP 
| BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R0 pointer += pointer prohibited", + }, + { + "map access: known scalar -= value_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R1 tried to subtract pointer from scalar", + }, + { + "map access: value_ptr -= known scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R0 min value is outside of the array range", + }, + { + "map access: value_ptr -= known scalar, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_MOV64_IMM(BPF_REG_2, 4), + 
BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_2), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: unknown scalar -= value_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R1 tried to subtract pointer from scalar", + }, + { + "map access: value_ptr -= unknown scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R0 min value is negative", + }, + { + "map access: value_ptr -= unknown scalar, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + 
BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), + BPF_ALU64_IMM(BPF_OR, BPF_REG_1, 0x7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: value_ptr -= value_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R0 invalid mem access 'inv'", + .errstr_unpriv = "R0 pointer -= pointer prohibited", + }, { "map lookup helper access to map", .insns = { From 832c6f2c29ec519b766923937f4f93fb1008b47d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:55 +0100 Subject: [PATCH 47/49] bpf: test make sure to run unpriv test cases in test_verifier Right now unprivileged tests are never executed as a BPF test run, only loaded. Allow for running them as well so that we can check the outcome and probe for regressions. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 71 ++++++++++++--------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 4c7445d4b3e6..6f61df62f690 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -76,7 +76,7 @@ struct bpf_test { int fixup_percpu_cgroup_storage[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; - uint32_t retval; + uint32_t retval, retval_unpriv; enum { UNDEF, ACCEPT, @@ -3084,6 +3084,8 @@ static struct bpf_test tests[] = { .fixup_prog1 = { 2 }, .result = ACCEPT, .retval = 42, + /* Verifier rewrite for unpriv skips tail call here. */ + .retval_unpriv = 2, }, { "stack pointer arithmetic", @@ -14149,6 +14151,33 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type, } } +static int set_admin(bool admin) +{ + cap_t caps; + const cap_value_t cap_val = CAP_SYS_ADMIN; + int ret = -1; + + caps = cap_get_proc(); + if (!caps) { + perror("cap_get_proc"); + return -1; + } + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, + admin ? CAP_SET : CAP_CLEAR)) { + perror("cap_set_flag"); + goto out; + } + if (cap_set_proc(caps)) { + perror("cap_set_proc"); + goto out; + } + ret = 0; +out: + if (cap_free(caps)) + perror("cap_free"); + return ret; +} + static void do_test_single(struct bpf_test *test, bool unpriv, int *passes, int *errors) { @@ -14157,6 +14186,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, struct bpf_insn *prog = test->insns; int map_fds[MAX_NR_MAPS]; const char *expected_err; + uint32_t expected_val; uint32_t retval; int i, err; @@ -14176,6 +14206,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, test->result_unpriv : test->result; expected_err = unpriv && test->errstr_unpriv ? 
test->errstr_unpriv : test->errstr; + expected_val = unpriv && test->retval_unpriv ? + test->retval_unpriv : test->retval; reject_from_alignment = fd_prog < 0 && (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) && @@ -14209,16 +14241,20 @@ static void do_test_single(struct bpf_test *test, bool unpriv, __u8 tmp[TEST_DATA_LEN << 2]; __u32 size_tmp = sizeof(tmp); + if (unpriv) + set_admin(true); err = bpf_prog_test_run(fd_prog, 1, test->data, sizeof(test->data), tmp, &size_tmp, &retval, NULL); + if (unpriv) + set_admin(false); if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) { printf("Unexpected bpf_prog_test_run error\n"); goto fail_log; } - if (!err && retval != test->retval && - test->retval != POINTER_VALUE) { - printf("FAIL retval %d != %d\n", retval, test->retval); + if (!err && retval != expected_val && + expected_val != POINTER_VALUE) { + printf("FAIL retval %d != %d\n", retval, expected_val); goto fail_log; } } @@ -14261,33 +14297,6 @@ static bool is_admin(void) return (sysadmin == CAP_SET); } -static int set_admin(bool admin) -{ - cap_t caps; - const cap_value_t cap_val = CAP_SYS_ADMIN; - int ret = -1; - - caps = cap_get_proc(); - if (!caps) { - perror("cap_get_proc"); - return -1; - } - if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, - admin ? CAP_SET : CAP_CLEAR)) { - perror("cap_set_flag"); - goto out; - } - if (cap_set_proc(caps)) { - perror("cap_set_proc"); - goto out; - } - ret = 0; -out: - if (cap_free(caps)) - perror("cap_free"); - return ret; -} - static void get_unpriv_disabled() { char buf[2]; From 30549aab146ccb1275230c3b4b4bc6b4181fd54e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 31 Oct 2018 16:08:10 +0100 Subject: [PATCH 48/49] net: stmmac: Fix stmmac_mdio_reset() when building stmmac as modules When building stmmac, it is only possible to select CONFIG_DWMAC_GENERIC, or any of the glue drivers, when CONFIG_STMMAC_PLATFORM is set. The only exception is CONFIG_STMMAC_PCI. 
When calling of_mdiobus_register(), it will call our ->reset() callback, which is set to stmmac_mdio_reset(). Most of the code in stmmac_mdio_reset() is protected by a "#if defined(CONFIG_STMMAC_PLATFORM)", which will evaluate to false when CONFIG_STMMAC_PLATFORM=m. Because of this, the phy reset gpio will only be pulled when stmmac is built as built-in, but not when built as modules. Fix this by using "#if IS_ENABLED()" instead of "#if defined()". Signed-off-by: Niklas Cassel Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index b72ef171477e..bdd351597b55 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -243,7 +243,7 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg, */ int stmmac_mdio_reset(struct mii_bus *bus) { -#if defined(CONFIG_STMMAC_PLATFORM) +#if IS_ENABLED(CONFIG_STMMAC_PLATFORM) struct net_device *ndev = bus->priv; struct stmmac_priv *priv = netdev_priv(ndev); unsigned int mii_address = priv->hw->mii.addr; From 46ebe2834ba5b541f28ee72e556a3fed42c47570 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Caama=C3=B1o=20Ruiz?= Date: Wed, 31 Oct 2018 18:52:03 +0100 Subject: [PATCH 49/49] openvswitch: Fix push/pop ethernet validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When there are both pop and push ethernet header actions among the actions to be applied to a packet, an unexpected EINVAL (Invalid argument) error is obtained. This is due to mac_proto not being reset correctly when those actions are validated. 
Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-October/047554.html Fixes: 91820da6ae85 ("openvswitch: add Ethernet push and pop actions") Signed-off-by: Jaime Caamaño Ruiz Tested-by: Greg Rose Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a70097ecf33c..865ecef68196 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3030,7 +3030,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, * is already present */ if (mac_proto != MAC_PROTO_NONE) return -EINVAL; - mac_proto = MAC_PROTO_NONE; + mac_proto = MAC_PROTO_ETHERNET; break; case OVS_ACTION_ATTR_POP_ETH: @@ -3038,7 +3038,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; if (vlan_tci & htons(VLAN_TAG_PRESENT)) return -EINVAL; - mac_proto = MAC_PROTO_ETHERNET; + mac_proto = MAC_PROTO_NONE; break; case OVS_ACTION_ATTR_PUSH_NSH: