2019-05-27 14:55:01 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2014-11-28 21:34:17 +08:00
|
|
|
/*
|
|
|
|
* include/net/switchdev.h - Switch device API
|
2015-09-24 16:02:41 +08:00
|
|
|
* Copyright (c) 2014-2015 Jiri Pirko <jiri@resnulli.us>
|
2015-03-10 04:59:09 +08:00
|
|
|
* Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com>
|
2014-11-28 21:34:17 +08:00
|
|
|
*/
|
|
|
|
#ifndef _LINUX_SWITCHDEV_H_
|
|
|
|
#define _LINUX_SWITCHDEV_H_
|
|
|
|
|
|
|
|
#include <linux/netdevice.h>
|
2015-01-16 06:49:36 +08:00
|
|
|
#include <linux/notifier.h>
|
2015-09-24 16:02:41 +08:00
|
|
|
#include <linux/list.h>
|
2015-10-15 01:40:51 +08:00
|
|
|
#include <net/ip_fib.h>
|
2015-01-16 06:49:36 +08:00
|
|
|
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
#define SWITCHDEV_F_NO_RECURSE BIT(0)
|
2015-10-09 10:23:18 +08:00
|
|
|
#define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1)
|
2015-10-15 01:40:50 +08:00
|
|
|
#define SWITCHDEV_F_DEFER BIT(2)
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
|
|
|
|
enum switchdev_attr_id {
|
2015-10-01 17:03:42 +08:00
|
|
|
SWITCHDEV_ATTR_ID_UNDEFINED,
|
|
|
|
SWITCHDEV_ATTR_ID_PORT_STP_STATE,
|
|
|
|
SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
|
2019-02-21 08:58:19 +08:00
|
|
|
SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS,
|
2017-02-09 21:54:42 +08:00
|
|
|
SWITCHDEV_ATTR_ID_PORT_MROUTER,
|
2015-10-09 10:23:17 +08:00
|
|
|
SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
|
2016-01-06 20:01:05 +08:00
|
|
|
SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
|
2020-11-29 20:54:05 +08:00
|
|
|
SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL,
|
2017-02-09 21:54:40 +08:00
|
|
|
SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
|
2017-10-09 17:15:31 +08:00
|
|
|
SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
|
2020-04-26 21:22:03 +08:00
|
|
|
SWITCHDEV_ATTR_ID_MRP_PORT_ROLE,
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
};
|
|
|
|
|
net: switchdev: pass flags and mask to both {PRE_,}BRIDGE_FLAGS attributes
This switchdev attribute offers a counterproductive API for a driver
writer, because although br_switchdev_set_port_flag gets passed a
"flags" and a "mask", those are passed piecemeal to the driver, so while
the PRE_BRIDGE_FLAGS listener knows what changed because it has the
"mask", the BRIDGE_FLAGS listener doesn't, because it only has the final
value. But certain drivers can offload only certain combinations of
settings, like for example they cannot change unicast flooding
independently of multicast flooding - they must be both on or both off.
The way the information is passed to switchdev makes drivers not
expressive enough, and unable to reject this request ahead of time, in
the PRE_BRIDGE_FLAGS notifier, so they are forced to reject it during
the deferred BRIDGE_FLAGS attribute, where the rejection is currently
ignored.
This patch also changes drivers to make use of the "mask" field for edge
detection when possible.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-12 23:15:55 +08:00
|
|
|
struct switchdev_brport_flags {
|
|
|
|
unsigned long val;
|
|
|
|
unsigned long mask;
|
|
|
|
};
|
|
|
|
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
struct switchdev_attr {
|
2015-12-15 23:03:35 +08:00
|
|
|
struct net_device *orig_dev;
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
enum switchdev_attr_id id;
|
|
|
|
u32 flags;
|
2016-04-21 18:52:43 +08:00
|
|
|
void *complete_priv;
|
|
|
|
void (*complete)(struct net_device *dev, int err, void *priv);
|
2015-05-11 00:47:49 +08:00
|
|
|
union {
|
2015-05-11 00:47:51 +08:00
|
|
|
u8 stp_state; /* PORT_STP_STATE */
|
net: switchdev: pass flags and mask to both {PRE_,}BRIDGE_FLAGS attributes
This switchdev attribute offers a counterproductive API for a driver
writer, because although br_switchdev_set_port_flag gets passed a
"flags" and a "mask", those are passed piecemeal to the driver, so while
the PRE_BRIDGE_FLAGS listener knows what changed because it has the
"mask", the BRIDGE_FLAGS listener doesn't, because it only has the final
value. But certain drivers can offload only certain combinations of
settings, like for example they cannot change unicast flooding
independently of multicast flooding - they must be both on or both off.
The way the information is passed to switchdev makes drivers not
expressive enough, and unable to reject this request ahead of time, in
the PRE_BRIDGE_FLAGS notifier, so they are forced to reject it during
the deferred BRIDGE_FLAGS attribute, where the rejection is currently
ignored.
This patch also changes drivers to make use of the "mask" field for edge
detection when possible.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-12 23:15:55 +08:00
|
|
|
struct switchdev_brport_flags brport_flags; /* PORT_BRIDGE_FLAGS */
|
2017-02-09 21:54:42 +08:00
|
|
|
bool mrouter; /* PORT_MROUTER */
|
2016-07-19 03:02:06 +08:00
|
|
|
clock_t ageing_time; /* BRIDGE_AGEING_TIME */
|
2016-01-06 20:01:05 +08:00
|
|
|
bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */
|
2020-11-29 20:54:05 +08:00
|
|
|
u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */
|
2017-02-09 21:54:40 +08:00
|
|
|
bool mc_disabled; /* MC_DISABLED */
|
2020-04-26 21:22:03 +08:00
|
|
|
u8 mrp_port_role; /* MRP_PORT_ROLE */
|
2015-05-14 02:16:50 +08:00
|
|
|
} u;
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
};
|
|
|
|
|
2015-05-11 00:47:52 +08:00
|
|
|
enum switchdev_obj_id {
|
2015-10-01 17:03:41 +08:00
|
|
|
SWITCHDEV_OBJ_ID_UNDEFINED,
|
|
|
|
SWITCHDEV_OBJ_ID_PORT_VLAN,
|
2016-01-11 04:06:22 +08:00
|
|
|
SWITCHDEV_OBJ_ID_PORT_MDB,
|
2017-11-10 06:10:59 +08:00
|
|
|
SWITCHDEV_OBJ_ID_HOST_MDB,
|
2020-04-26 21:22:03 +08:00
|
|
|
SWITCHDEV_OBJ_ID_MRP,
|
|
|
|
SWITCHDEV_OBJ_ID_RING_TEST_MRP,
|
|
|
|
SWITCHDEV_OBJ_ID_RING_ROLE_MRP,
|
|
|
|
SWITCHDEV_OBJ_ID_RING_STATE_MRP,
|
2020-07-14 15:34:47 +08:00
|
|
|
SWITCHDEV_OBJ_ID_IN_TEST_MRP,
|
|
|
|
SWITCHDEV_OBJ_ID_IN_ROLE_MRP,
|
|
|
|
SWITCHDEV_OBJ_ID_IN_STATE_MRP,
|
2015-05-11 00:47:52 +08:00
|
|
|
};
|
|
|
|
|
2015-10-01 17:03:45 +08:00
|
|
|
struct switchdev_obj {
|
net: bridge: add helper to replay port and host-joined mdb entries
I have a system with DSA ports, and udhcpcd is configured to bring
interfaces up as soon as they are created.
I create a bridge as follows:
ip link add br0 type bridge
As soon as I create the bridge and udhcpcd brings it up, I also have
avahi which automatically starts sending IPv6 packets to advertise some
local services, and because of that, the br0 bridge joins the following
IPv6 groups due to the code path detailed below:
33:33:ff:6d:c1:9c vid 0
33:33:00:00:00:6a vid 0
33:33:00:00:00:fb vid 0
br_dev_xmit
-> br_multicast_rcv
-> br_ip6_multicast_add_group
-> __br_multicast_add_group
-> br_multicast_host_join
-> br_mdb_notify
This is all fine, but inside br_mdb_notify we have br_mdb_switchdev_host
hooked up, and switchdev will attempt to offload the host joined groups
to an empty list of ports. Of course nobody offloads them.
Then when we add a port to br0:
ip link set swp0 master br0
the bridge doesn't replay the host-joined MDB entries from br_add_if,
and eventually the host joined addresses expire, and a switchdev
notification for deleting it is emitted, but surprise, the original
addition was already completely missed.
The strategy to address this problem is to replay the MDB entries (both
the port ones and the host joined ones) when the new port joins the
bridge, similar to what vxlan_fdb_replay does (in that case, its FDB can
be populated and only then attached to a bridge that you offload).
However there are 2 possibilities: the addresses can be 'pushed' by the
bridge into the port, or the port can 'pull' them from the bridge.
Considering that in the general case, the new port can be really late to
the party, and there may have been many other switchdev ports that
already received the initial notification, we would like to avoid
delivering duplicate events to them, since they might misbehave. And
currently, the bridge calls the entire switchdev notifier chain, whereas
for replaying it should just call the notifier block of the new guy.
But the bridge doesn't know what is the new guy's notifier block, it
just knows where the switchdev notifier chain is. So for simplification,
we make this a driver-initiated pull for now, and the notifier block is
passed as an argument.
To emulate the calling context for mdb objects (deferred and put on the
blocking notifier chain), we must iterate under RCU protection through
the bridge's mdb entries, queue them, and only call them once we're out
of the RCU read-side critical section.
There was some opportunity for reuse between br_mdb_switchdev_host_port,
br_mdb_notify and the newly added br_mdb_queue_one in how the switchdev
mdb object is created, so a helper was created.
Suggested-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-03-23 07:51:44 +08:00
|
|
|
struct list_head list;
|
2015-12-15 23:03:35 +08:00
|
|
|
struct net_device *orig_dev;
|
2015-10-01 17:03:46 +08:00
|
|
|
enum switchdev_obj_id id;
|
2015-10-15 01:40:52 +08:00
|
|
|
u32 flags;
|
2016-04-21 18:52:43 +08:00
|
|
|
void *complete_priv;
|
|
|
|
void (*complete)(struct net_device *dev, int err, void *priv);
|
2015-10-01 17:03:45 +08:00
|
|
|
};
|
|
|
|
|
2015-10-01 17:03:41 +08:00
|
|
|
/* SWITCHDEV_OBJ_ID_PORT_VLAN */
|
2015-10-01 17:03:43 +08:00
|
|
|
struct switchdev_obj_port_vlan {
|
2015-10-01 17:03:45 +08:00
|
|
|
struct switchdev_obj obj;
|
2015-09-30 00:07:18 +08:00
|
|
|
u16 flags;
|
net: switchdev: remove vid_begin -> vid_end range from VLAN objects
The call path of a switchdev VLAN addition to the bridge looks something
like this today:
nbp_vlan_init
| __br_vlan_set_default_pvid
| | |
| | br_afspec |
| | | |
| | v |
| | br_process_vlan_info |
| | | |
| | v |
| | br_vlan_info |
| | / \ /
| | / \ /
| | / \ /
| | / \ /
v v v v v
nbp_vlan_add br_vlan_add ------+
| ^ ^ | |
| / | | |
| / / / |
\ br_vlan_get_master/ / v
\ ^ / / br_vlan_add_existing
\ | / / |
\ | / / /
\ | / / /
\ | / / /
\ | / / /
v | | v /
__vlan_add /
/ | /
/ | /
v | /
__vlan_vid_add | /
\ | /
v v v
br_switchdev_port_vlan_add
The ranges UAPI was introduced to the bridge in commit bdced7ef7838
("bridge: support for multiple vlans and vlan ranges in setlink and
dellink requests") (Jan 10 2015). But the VLAN ranges (parsed in br_afspec)
have always been passed one by one, through struct bridge_vlan_info
tmp_vinfo, to br_vlan_info. So the range never went too far in depth.
Then Scott Feldman introduced the switchdev_port_bridge_setlink function
in commit 47f8328bb1a4 ("switchdev: add new switchdev bridge setlink").
That marked the introduction of the SWITCHDEV_OBJ_PORT_VLAN, which made
full use of the range. But switchdev_port_bridge_setlink was called like
this:
br_setlink
-> br_afspec
-> switchdev_port_bridge_setlink
Basically, the switchdev and the bridge code were not tightly integrated.
Then commit 41c498b9359e ("bridge: restore br_setlink back to original")
came, and switchdev drivers were required to implement
.ndo_bridge_setlink = switchdev_port_bridge_setlink for a while.
In the meantime, commits such as 0944d6b5a2fa ("bridge: try switchdev op
first in __vlan_vid_add/del") finally made switchdev penetrate the
br_vlan_info() barrier and start to develop the call path we have today.
But remember, br_vlan_info() still receives VLANs one by one.
Then Arkadi Sharshevsky refactored the switchdev API in 2017 in commit
29ab586c3d83 ("net: switchdev: Remove bridge bypass support from
switchdev") so that drivers would not implement .ndo_bridge_setlink any
longer. The switchdev_port_bridge_setlink also got deleted.
This refactoring removed the parallel bridge_setlink implementation from
switchdev, and left the only switchdev VLAN objects to be the ones
offloaded from __vlan_vid_add (basically RX filtering) and __vlan_add
(the latter coming from commit 9c86ce2c1ae3 ("net: bridge: Notify about
bridge VLANs")).
That is to say, today the switchdev VLAN object ranges are not used in
the kernel. Refactoring the above call path is a bit complicated, when
the bridge VLAN call path is already a bit complicated.
Let's go off and finish the job of commit 29ab586c3d83 by deleting the
bogus iteration through the VLAN ranges from the drivers. Some aspects
of this feature never made too much sense in the first place. For
example, what is a range of VLANs all having the BRIDGE_VLAN_INFO_PVID
flag supposed to mean, when a port can obviously have a single pvid?
This particular configuration _is_ denied as of commit 6623c60dc28e
("bridge: vlan: enforce no pvid flag in vlan ranges"), but from an API
perspective, the driver still has to play pretend, and only offload the
vlan->vid_end as pvid. And the addition of a switchdev VLAN object can
modify the flags of another, completely unrelated, switchdev VLAN
object! (a VLAN that is PVID will invalidate the PVID flag from whatever
other VLAN had previously been offloaded with switchdev and had that
flag. Yet switchdev never notifies about that change, drivers are
supposed to guess).
Nonetheless, having a VLAN range in the API makes error handling look
scarier than it really is - unwinding on errors and all of that.
When in reality, no one really calls this API with more than one VLAN.
It is all unnecessary complexity.
And despite appearing pretentious (two-phase transactional model and
all), the switchdev API is really sloppy because the VLAN addition and
removal operations are not paired with one another (you can add a VLAN
100 times and delete it just once). The bridge notifies through
switchdev of a VLAN addition not only when the flags of an existing VLAN
change, but also when nothing changes. There are switchdev drivers out
there who don't like adding a VLAN that has already been added, and
those checks don't really belong at driver level. But the fact that the
API contains ranges is yet another factor that prevents this from being
addressed in the future.
Of the existing switchdev pieces of hardware, it appears that only
Mellanox Spectrum supports offloading more than one VLAN at a time,
through mlxsw_sp_port_vlan_set. I have kept that code internal to the
driver, because there is some more bookkeeping that makes use of it, but
I deleted it from the switchdev API. But since the switchdev support for
ranges has already been de facto deleted by a Mellanox employee and
nobody noticed for 4 years, I'm going to assume it's not a biggie.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com> # switchdev and mlxsw
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 08:01:46 +08:00
|
|
|
u16 vid;
|
2015-09-30 00:07:18 +08:00
|
|
|
};
|
|
|
|
|
switchdev: SWITCHDEV_OBJ_PORT_{VLAN, MDB}(): Sanitize
The two macros SWITCHDEV_OBJ_PORT_VLAN() and SWITCHDEV_OBJ_PORT_MDB()
expand to a container_of() call, yielding an appropriate container of
their sole argument. However, due to a name collision, the first
argument, i.e. the contained object pointer, is not the only one to get
expanded. The third argument, which is a structure member name, and
should be kept literal, gets expanded as well. The only safe way to use
these two macros is therefore to name the local variable passed to them
"obj".
To fix this, rename the sole argument of the two macros from
"obj" (which collides with the member name) to "OBJ". Additionally,
instead of passing "OBJ" to container_of() verbatim, parenthesize it, so
that a comma in the passed-in expression doesn't pollute the
container_of() invocation.
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-23 07:28:07 +08:00
|
|
|
#define SWITCHDEV_OBJ_PORT_VLAN(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_port_vlan, obj)
|
2015-10-01 17:03:45 +08:00
|
|
|
|
2016-01-11 04:06:22 +08:00
|
|
|
/* SWITCHDEV_OBJ_ID_PORT_MDB */
|
|
|
|
struct switchdev_obj_port_mdb {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
unsigned char addr[ETH_ALEN];
|
|
|
|
u16 vid;
|
|
|
|
};
|
|
|
|
|
switchdev: SWITCHDEV_OBJ_PORT_{VLAN, MDB}(): Sanitize
The two macros SWITCHDEV_OBJ_PORT_VLAN() and SWITCHDEV_OBJ_PORT_MDB()
expand to a container_of() call, yielding an appropriate container of
their sole argument. However, due to a name collision, the first
argument, i.e. the contained object pointer, is not the only one to get
expanded. The third argument, which is a structure member name, and
should be kept literal, gets expanded as well. The only safe way to use
these two macros is therefore to name the local variable passed to them
"obj".
To fix this, rename the sole argument of the two macros from
"obj" (which collides with the member name) to "OBJ". Additionally,
instead of passing "OBJ" to container_of() verbatim, parenthesize it, so
that a comma in the passed-in expression doesn't pollute the
container_of() invocation.
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-23 07:28:07 +08:00
|
|
|
#define SWITCHDEV_OBJ_PORT_MDB(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_port_mdb, obj)
|
2016-01-11 04:06:22 +08:00
|
|
|
|
2020-04-26 21:22:03 +08:00
|
|
|
|
|
|
|
/* SWITCHDEV_OBJ_ID_MRP */
|
|
|
|
struct switchdev_obj_mrp {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
struct net_device *p_port;
|
|
|
|
struct net_device *s_port;
|
|
|
|
u32 ring_id;
|
2020-05-31 02:09:47 +08:00
|
|
|
u16 prio;
|
2020-04-26 21:22:03 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define SWITCHDEV_OBJ_MRP(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_mrp, obj)
|
|
|
|
|
|
|
|
/* SWITCHDEV_OBJ_ID_RING_TEST_MRP */
|
|
|
|
struct switchdev_obj_ring_test_mrp {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
/* The value is in us and a value of 0 represents to stop */
|
|
|
|
u32 interval;
|
|
|
|
u8 max_miss;
|
|
|
|
u32 ring_id;
|
|
|
|
u32 period;
|
2020-05-31 02:09:48 +08:00
|
|
|
bool monitor;
|
2020-04-26 21:22:03 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define SWITCHDEV_OBJ_RING_TEST_MRP(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_ring_test_mrp, obj)
|
|
|
|
|
|
|
|
/* SWICHDEV_OBJ_ID_RING_ROLE_MRP */
|
|
|
|
struct switchdev_obj_ring_role_mrp {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
u8 ring_role;
|
|
|
|
u32 ring_id;
|
2021-02-17 05:41:59 +08:00
|
|
|
u8 sw_backup;
|
2020-04-26 21:22:03 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define SWITCHDEV_OBJ_RING_ROLE_MRP(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_ring_role_mrp, obj)
|
|
|
|
|
|
|
|
struct switchdev_obj_ring_state_mrp {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
u8 ring_state;
|
|
|
|
u32 ring_id;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SWITCHDEV_OBJ_RING_STATE_MRP(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_ring_state_mrp, obj)
|
|
|
|
|
2020-07-14 15:34:47 +08:00
|
|
|
/* SWITCHDEV_OBJ_ID_IN_TEST_MRP */
|
|
|
|
struct switchdev_obj_in_test_mrp {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
/* The value is in us and a value of 0 represents to stop */
|
|
|
|
u32 interval;
|
|
|
|
u32 in_id;
|
|
|
|
u32 period;
|
|
|
|
u8 max_miss;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SWITCHDEV_OBJ_IN_TEST_MRP(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_in_test_mrp, obj)
|
|
|
|
|
|
|
|
/* SWICHDEV_OBJ_ID_IN_ROLE_MRP */
|
|
|
|
struct switchdev_obj_in_role_mrp {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
struct net_device *i_port;
|
|
|
|
u32 ring_id;
|
|
|
|
u16 in_id;
|
|
|
|
u8 in_role;
|
2021-02-17 05:41:59 +08:00
|
|
|
u8 sw_backup;
|
2020-07-14 15:34:47 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define SWITCHDEV_OBJ_IN_ROLE_MRP(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_in_role_mrp, obj)
|
|
|
|
|
|
|
|
struct switchdev_obj_in_state_mrp {
|
|
|
|
struct switchdev_obj obj;
|
|
|
|
u32 in_id;
|
|
|
|
u8 in_state;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SWITCHDEV_OBJ_IN_STATE_MRP(OBJ) \
|
|
|
|
container_of((OBJ), struct switchdev_obj_in_state_mrp, obj)
|
|
|
|
|
2015-10-01 17:03:45 +08:00
|
|
|
typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj);
|
|
|
|
|
net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge
With the introduction of explicit offloading API in switchdev in commit
2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge
ports are offloaded"), we started having Ethernet switch drivers calling
directly into a function exported by net/bridge/br_switchdev.c, which is
a function exported by the bridge driver.
This means that drivers that did not have an explicit dependency on the
bridge before, like cpsw and am65-cpsw, now do - otherwise it is not
possible to call a symbol exported by a driver that can be built as
module unless you are a module too.
There was an attempt to solve the dependency issue in the form of commit
b0e81817629a ("net: build all switchdev drivers as modules when the
bridge is a module"). Grygorii Strashko, however, says about it:
| In my opinion, the problem is a bit bigger here than just fixing the
| build :(
|
| In case, of ^cpsw the switchdev mode is kinda optional and in many
| cases (especially for testing purposes, NFS) the multi-mac mode is
| still preferable mode.
|
| There were no such tight dependency between switchdev drivers and
| bridge core before and switchdev serviced as independent, notification
| based layer between them, so ^cpsw still can be "Y" and bridge can be
| "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE
| will need to be set as "Y", or we will have to update drivers to
| support build with BRIDGE=n and maintain separate builds for
| networking vs non-networking testing. But is this enough? Wouldn't
| it cause 'chain reaction' required to add more and more "Y" options
| (like CONFIG_VLAN_8021Q)?
|
| PS. Just to be sure we on the same page - ARM builds will be forced
| (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our
| automation testing will just fail with omap2plus_defconfig.
In the light of this, it would be desirable for some configurations to
avoid dependencies between switchdev drivers and the bridge, and have
the switchdev mode as completely optional within the driver.
Arnd Bergmann also tried to write a patch which better expressed the
build time dependency for Ethernet switch drivers where the switchdev
support is optional, like cpsw/am65-cpsw, and this made the drivers
follow the bridge (compile as module if the bridge is a module) only if
the optional switchdev support in the driver was enabled in the first
place:
https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/
but this still did not solve the fact that cpsw and am65-cpsw now must
be built as modules when the bridge is a module - it just expressed
correctly that optional dependency. But the new behavior is an apparent
regression from Grygorii's perspective.
So to support the use case where the Ethernet driver is built-in,
NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we
need a framework that can handle the possible absence of the bridge from
the running system, i.e. runtime bloatware as opposed to build-time
bloatware.
Luckily we already have this framework, since switchdev has been using
it extensively. Events from the bridge side are transmitted to the
driver side using notifier chains - this was originally done so that
unrelated drivers could snoop for events emitted by the bridge towards
ports that are implemented by other drivers (think of a switch driver
with LAG offload that listens for switchdev events on a bonding/team
interface that it offloads).
There are also events which are transmitted from the driver side to the
bridge side, which again are modeled using notifiers.
SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with
notifying the bridge that a MAC address has been dynamically learned.
So there is a precedent we can use for modeling the new framework.
The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work
that the bridge needs to do when a port becomes offloaded is blocking in
its nature: replay VLANs, MDBs etc. The calling context is indeed
blocking (we are under rtnl_mutex), but the existing switchdev
notification chain that the bridge is subscribed to is only the atomic
one. So we need to subscribe the bridge to the blocking switchdev
notification chain too.
This patch:
- keeps the driver-side perception of the switchdev_bridge_port_{,un}offload
unchanged
- moves the implementation of switchdev_bridge_port_{,un}offload from
the bridge module into the switchdev module.
- makes everybody that is subscribed to the switchdev blocking notifier
chain "hear" offload & unoffload events
- makes the bridge driver subscribe and handle those events
- moves the bridge driver's handling of those events into 2 new
functions called br_switchdev_port_{,un}offload. These functions
contain in fact the core of the logic that was previously in
switchdev_bridge_port_{,un}offload, just that now we go through an
extra indirection layer to reach them.
Unlike all the other switchdev notification structures, the structure
used to carry the bridge port information, struct
switchdev_notifier_brport_info, does not contain a "bool handled".
This is because in the current usage pattern, we always know that a
switchdev bridge port offloading event will be handled by the bridge,
because the switchdev_bridge_port_offload() call was initiated by a
NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a
bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event
couldn't have happened.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-04 04:34:08 +08:00
|
|
|
struct switchdev_brport {
|
|
|
|
struct net_device *dev;
|
|
|
|
const void *ctx;
|
|
|
|
struct notifier_block *atomic_nb;
|
|
|
|
struct notifier_block *blocking_nb;
|
|
|
|
bool tx_fwd_offload;
|
|
|
|
};
|
|
|
|
|
2015-05-11 00:47:46 +08:00
|
|
|
enum switchdev_notifier_type {
|
2017-06-08 14:44:14 +08:00
|
|
|
SWITCHDEV_FDB_ADD_TO_BRIDGE = 1,
|
|
|
|
SWITCHDEV_FDB_DEL_TO_BRIDGE,
|
|
|
|
SWITCHDEV_FDB_ADD_TO_DEVICE,
|
|
|
|
SWITCHDEV_FDB_DEL_TO_DEVICE,
|
2017-06-08 14:44:15 +08:00
|
|
|
SWITCHDEV_FDB_OFFLOADED,
|
2020-09-11 01:23:48 +08:00
|
|
|
SWITCHDEV_FDB_FLUSH_TO_BRIDGE,
|
2018-10-17 16:53:22 +08:00
|
|
|
|
switchdev: Add SWITCHDEV_PORT_OBJ_ADD, SWITCHDEV_PORT_OBJ_DEL
An offloading driver may need to have access to switchdev events on
ports that aren't directly under its control. An example is a VXLAN port
attached to a bridge offloaded by a driver. The driver needs to know
about VLANs configured on the VXLAN device. However the VXLAN device
isn't stashed between the bridge and a front-panel-port device (such as
is the case e.g. for LAG devices), so the usual switchdev ops don't
reach the driver.
VXLAN is likely not the only device type like this: in theory any L2
tunnel device that needs offloading will prompt requirement of this
sort. This falsifies the assumption that only the lower devices of a
front panel port need to be notified to achieve flawless offloading.
A way to fix this is to give up the notion of port object addition /
deletion as a switchdev operation, which assumes somewhat tight coupling
between the message producer and consumer. And instead send the message
over a notifier chain.
To that end, introduce two new switchdev notifier types,
SWITCHDEV_PORT_OBJ_ADD and SWITCHDEV_PORT_OBJ_DEL. These notifier types
communicate the same event as the corresponding switchdev op, except in
a form of a notification. struct switchdev_notifier_port_obj_info was
added to carry the fields that the switchdev op carries. An additional
field, handled, will be used to communicate back to switchdev that the
event has reached an interested party, which will be important for the
two-phase commit.
The two switchdev operations themselves are kept in place. Following
patches first convert individual clients to the notifier protocol, and
only then are the operations removed.
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-23 07:28:38 +08:00
|
|
|
SWITCHDEV_PORT_OBJ_ADD, /* Blocking. */
|
|
|
|
SWITCHDEV_PORT_OBJ_DEL, /* Blocking. */
|
2019-02-28 03:44:25 +08:00
|
|
|
SWITCHDEV_PORT_ATTR_SET, /* May be blocking . */
|
switchdev: Add SWITCHDEV_PORT_OBJ_ADD, SWITCHDEV_PORT_OBJ_DEL
An offloading driver may need to have access to switchdev events on
ports that aren't directly under its control. An example is a VXLAN port
attached to a bridge offloaded by a driver. The driver needs to know
about VLANs configured on the VXLAN device. However the VXLAN device
isn't stashed between the bridge and a front-panel-port device (such as
is the case e.g. for LAG devices), so the usual switchdev ops don't
reach the driver.
VXLAN is likely not the only device type like this: in theory any L2
tunnel device that needs offloading will prompt requirement of this
sort. This falsifies the assumption that only the lower devices of a
front panel port need to be notified to achieve flawless offloading.
A way to fix this is to give up the notion of port object addition /
deletion as a switchdev operation, which assumes somewhat tight coupling
between the message producer and consumer. And instead send the message
over a notifier chain.
To that end, introduce two new switchdev notifier types,
SWITCHDEV_PORT_OBJ_ADD and SWITCHDEV_PORT_OBJ_DEL. These notifier types
communicate the same event as the corresponding switchdev op, except in
a form of a notification. struct switchdev_notifier_port_obj_info was
added to carry the fields that the switchdev op carries. An additional
field, handled, will be used to communicate back to switchdev that the
event has reached an interested party, which will be important for the
two-phase commit.
The two switchdev operations themselves are kept in place. Following
patches first convert individual clients to the notifier protocol, and
only then are the operations removed.
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-23 07:28:38 +08:00
|
|
|
|
2018-11-21 16:02:39 +08:00
|
|
|
SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE,
|
|
|
|
SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE,
|
2018-10-17 16:53:22 +08:00
|
|
|
SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
|
|
|
|
SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE,
|
2018-10-17 16:53:26 +08:00
|
|
|
SWITCHDEV_VXLAN_FDB_OFFLOADED,
|
net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge
With the introduction of explicit offloading API in switchdev in commit
2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge
ports are offloaded"), we started having Ethernet switch drivers calling
directly into a function exported by net/bridge/br_switchdev.c, which is
a function exported by the bridge driver.
This means that drivers that did not have an explicit dependency on the
bridge before, like cpsw and am65-cpsw, now do - otherwise it is not
possible to call a symbol exported by a driver that can be built as
module unless you are a module too.
There was an attempt to solve the dependency issue in the form of commit
b0e81817629a ("net: build all switchdev drivers as modules when the
bridge is a module"). Grygorii Strashko, however, says about it:
| In my opinion, the problem is a bit bigger here than just fixing the
| build :(
|
| In case, of ^cpsw the switchdev mode is kinda optional and in many
| cases (especially for testing purposes, NFS) the multi-mac mode is
| still preferable mode.
|
| There were no such tight dependency between switchdev drivers and
| bridge core before and switchdev serviced as independent, notification
| based layer between them, so ^cpsw still can be "Y" and bridge can be
| "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE
| will need to be set as "Y", or we will have to update drivers to
| support build with BRIDGE=n and maintain separate builds for
| networking vs non-networking testing. But is this enough? Wouldn't
| it cause 'chain reaction' required to add more and more "Y" options
| (like CONFIG_VLAN_8021Q)?
|
| PS. Just to be sure we on the same page - ARM builds will be forced
| (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our
| automation testing will just fail with omap2plus_defconfig.
In the light of this, it would be desirable for some configurations to
avoid dependencies between switchdev drivers and the bridge, and have
the switchdev mode as completely optional within the driver.
Arnd Bergmann also tried to write a patch which better expressed the
build time dependency for Ethernet switch drivers where the switchdev
support is optional, like cpsw/am65-cpsw, and this made the drivers
follow the bridge (compile as module if the bridge is a module) only if
the optional switchdev support in the driver was enabled in the first
place:
https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/
but this still did not solve the fact that cpsw and am65-cpsw now must
be built as modules when the bridge is a module - it just expressed
correctly that optional dependency. But the new behavior is an apparent
regression from Grygorii's perspective.
So to support the use case where the Ethernet driver is built-in,
NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we
need a framework that can handle the possible absence of the bridge from
the running system, i.e. runtime bloatware as opposed to build-time
bloatware.
Luckily we already have this framework, since switchdev has been using
it extensively. Events from the bridge side are transmitted to the
driver side using notifier chains - this was originally done so that
unrelated drivers could snoop for events emitted by the bridge towards
ports that are implemented by other drivers (think of a switch driver
with LAG offload that listens for switchdev events on a bonding/team
interface that it offloads).
There are also events which are transmitted from the driver side to the
bridge side, which again are modeled using notifiers.
SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with
notifying the bridge that a MAC address has been dynamically learned.
So there is a precedent we can use for modeling the new framework.
The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work
that the bridge needs to do when a port becomes offloaded is blocking in
its nature: replay VLANs, MDBs etc. The calling context is indeed
blocking (we are under rtnl_mutex), but the existing switchdev
notification chain that the bridge is subscribed to is only the atomic
one. So we need to subscribe the bridge to the blocking switchdev
notification chain too.
This patch:
- keeps the driver-side perception of the switchdev_bridge_port_{,un}offload
unchanged
- moves the implementation of switchdev_bridge_port_{,un}offload from
the bridge module into the switchdev module.
- makes everybody that is subscribed to the switchdev blocking notifier
chain "hear" offload & unoffload events
- makes the bridge driver subscribe and handle those events
- moves the bridge driver's handling of those events into 2 new
functions called br_switchdev_port_{,un}offload. These functions
contain in fact the core of the logic that was previously in
switchdev_bridge_port_{,un}offload, just that now we go through an
extra indirection layer to reach them.
Unlike all the other switchdev notification structures, the structure
used to carry the bridge port information, struct
switchdev_notifier_brport_info, does not contain a "bool handled".
This is because in the current usage pattern, we always know that a
switchdev bridge port offloading event will be handled by the bridge,
because the switchdev_bridge_port_offload() call was initiated by a
NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a
bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event
couldn't have happened.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-04 04:34:08 +08:00
|
|
|
|
|
|
|
SWITCHDEV_BRPORT_OFFLOADED,
|
|
|
|
SWITCHDEV_BRPORT_UNOFFLOADED,
|
2015-01-16 06:49:37 +08:00
|
|
|
};
|
|
|
|
|
2015-05-11 00:47:46 +08:00
|
|
|
struct switchdev_notifier_info {
|
2015-01-16 06:49:36 +08:00
|
|
|
struct net_device *dev;
|
2018-12-13 01:02:54 +08:00
|
|
|
struct netlink_ext_ack *extack;
|
2021-06-27 19:54:24 +08:00
|
|
|
const void *ctx;
|
2015-01-16 06:49:36 +08:00
|
|
|
};
|
|
|
|
|
2015-05-11 00:47:46 +08:00
|
|
|
struct switchdev_notifier_fdb_info {
|
|
|
|
struct switchdev_notifier_info info; /* must be first */
|
2015-01-16 06:49:37 +08:00
|
|
|
const unsigned char *addr;
|
|
|
|
u16 vid;
|
2018-10-17 16:53:29 +08:00
|
|
|
u8 added_by_user:1,
|
2021-04-15 00:52:56 +08:00
|
|
|
is_local:1,
|
2018-10-17 16:53:29 +08:00
|
|
|
offloaded:1;
|
2015-01-16 06:49:37 +08:00
|
|
|
};
|
|
|
|
|
switchdev: Add SWITCHDEV_PORT_OBJ_ADD, SWITCHDEV_PORT_OBJ_DEL
An offloading driver may need to have access to switchdev events on
ports that aren't directly under its control. An example is a VXLAN port
attached to a bridge offloaded by a driver. The driver needs to know
about VLANs configured on the VXLAN device. However the VXLAN device
isn't stashed between the bridge and a front-panel-port device (such as
is the case e.g. for LAG devices), so the usual switchdev ops don't
reach the driver.
VXLAN is likely not the only device type like this: in theory any L2
tunnel device that needs offloading will prompt requirement of this
sort. This falsifies the assumption that only the lower devices of a
front panel port need to be notified to achieve flawless offloading.
A way to fix this is to give up the notion of port object addition /
deletion as a switchdev operation, which assumes somewhat tight coupling
between the message producer and consumer. And instead send the message
over a notifier chain.
To that end, introduce two new switchdev notifier types,
SWITCHDEV_PORT_OBJ_ADD and SWITCHDEV_PORT_OBJ_DEL. These notifier types
communicate the same event as the corresponding switchdev op, except in
a form of a notification. struct switchdev_notifier_port_obj_info was
added to carry the fields that the switchdev op carries. An additional
field, handled, will be used to communicate back to switchdev that the
event has reached an interested party, which will be important for the
two-phase commit.
The two switchdev operations themselves are kept in place. Following
patches first convert individual clients to the notifier protocol, and
only then are the operations removed.
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-23 07:28:38 +08:00
|
|
|
struct switchdev_notifier_port_obj_info {
|
|
|
|
struct switchdev_notifier_info info; /* must be first */
|
|
|
|
const struct switchdev_obj *obj;
|
|
|
|
bool handled;
|
|
|
|
};
|
|
|
|
|
2019-02-28 03:44:25 +08:00
|
|
|
struct switchdev_notifier_port_attr_info {
|
|
|
|
struct switchdev_notifier_info info; /* must be first */
|
|
|
|
const struct switchdev_attr *attr;
|
|
|
|
bool handled;
|
|
|
|
};
|
|
|
|
|
net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge
With the introduction of explicit offloading API in switchdev in commit
2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge
ports are offloaded"), we started having Ethernet switch drivers calling
directly into a function exported by net/bridge/br_switchdev.c, which is
a function exported by the bridge driver.
This means that drivers that did not have an explicit dependency on the
bridge before, like cpsw and am65-cpsw, now do - otherwise it is not
possible to call a symbol exported by a driver that can be built as
module unless you are a module too.
There was an attempt to solve the dependency issue in the form of commit
b0e81817629a ("net: build all switchdev drivers as modules when the
bridge is a module"). Grygorii Strashko, however, says about it:
| In my opinion, the problem is a bit bigger here than just fixing the
| build :(
|
| In case, of ^cpsw the switchdev mode is kinda optional and in many
| cases (especially for testing purposes, NFS) the multi-mac mode is
| still preferable mode.
|
| There were no such tight dependency between switchdev drivers and
| bridge core before and switchdev serviced as independent, notification
| based layer between them, so ^cpsw still can be "Y" and bridge can be
| "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE
| will need to be set as "Y", or we will have to update drivers to
| support build with BRIDGE=n and maintain separate builds for
| networking vs non-networking testing. But is this enough? Wouldn't
| it cause 'chain reaction' required to add more and more "Y" options
| (like CONFIG_VLAN_8021Q)?
|
| PS. Just to be sure we on the same page - ARM builds will be forced
| (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our
| automation testing will just fail with omap2plus_defconfig.
In the light of this, it would be desirable for some configurations to
avoid dependencies between switchdev drivers and the bridge, and have
the switchdev mode as completely optional within the driver.
Arnd Bergmann also tried to write a patch which better expressed the
build time dependency for Ethernet switch drivers where the switchdev
support is optional, like cpsw/am65-cpsw, and this made the drivers
follow the bridge (compile as module if the bridge is a module) only if
the optional switchdev support in the driver was enabled in the first
place:
https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/
but this still did not solve the fact that cpsw and am65-cpsw now must
be built as modules when the bridge is a module - it just expressed
correctly that optional dependency. But the new behavior is an apparent
regression from Grygorii's perspective.
So to support the use case where the Ethernet driver is built-in,
NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we
need a framework that can handle the possible absence of the bridge from
the running system, i.e. runtime bloatware as opposed to build-time
bloatware.
Luckily we already have this framework, since switchdev has been using
it extensively. Events from the bridge side are transmitted to the
driver side using notifier chains - this was originally done so that
unrelated drivers could snoop for events emitted by the bridge towards
ports that are implemented by other drivers (think of a switch driver
with LAG offload that listens for switchdev events on a bonding/team
interface that it offloads).
There are also events which are transmitted from the driver side to the
bridge side, which again are modeled using notifiers.
SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with
notifying the bridge that a MAC address has been dynamically learned.
So there is a precedent we can use for modeling the new framework.
The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work
that the bridge needs to do when a port becomes offloaded is blocking in
its nature: replay VLANs, MDBs etc. The calling context is indeed
blocking (we are under rtnl_mutex), but the existing switchdev
notification chain that the bridge is subscribed to is only the atomic
one. So we need to subscribe the bridge to the blocking switchdev
notification chain too.
This patch:
- keeps the driver-side perception of the switchdev_bridge_port_{,un}offload
unchanged
- moves the implementation of switchdev_bridge_port_{,un}offload from
the bridge module into the switchdev module.
- makes everybody that is subscribed to the switchdev blocking notifier
chain "hear" offload & unoffload events
- makes the bridge driver subscribe and handle those events
- moves the bridge driver's handling of those events into 2 new
functions called br_switchdev_port_{,un}offload. These functions
contain in fact the core of the logic that was previously in
switchdev_bridge_port_{,un}offload, just that now we go through an
extra indirection layer to reach them.
Unlike all the other switchdev notification structures, the structure
used to carry the bridge port information, struct
switchdev_notifier_brport_info, does not contain a "bool handled".
This is because in the current usage pattern, we always know that a
switchdev bridge port offloading event will be handled by the bridge,
because the switchdev_bridge_port_offload() call was initiated by a
NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a
bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event
couldn't have happened.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-04 04:34:08 +08:00
|
|
|
struct switchdev_notifier_brport_info {
|
|
|
|
struct switchdev_notifier_info info; /* must be first */
|
|
|
|
const struct switchdev_brport brport;
|
|
|
|
};
|
|
|
|
|
2015-01-16 06:49:36 +08:00
|
|
|
static inline struct net_device *
|
2015-05-11 00:47:46 +08:00
|
|
|
switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info)
|
2015-01-16 06:49:36 +08:00
|
|
|
{
|
|
|
|
return info->dev;
|
|
|
|
}
|
2014-11-28 21:34:17 +08:00
|
|
|
|
2018-12-13 01:02:54 +08:00
|
|
|
static inline struct netlink_ext_ack *
|
|
|
|
switchdev_notifier_info_to_extack(const struct switchdev_notifier_info *info)
|
|
|
|
{
|
|
|
|
return info->extack;
|
|
|
|
}
|
|
|
|
|
2021-07-19 21:51:38 +08:00
|
|
|
static inline bool
|
|
|
|
switchdev_fdb_is_dynamically_learned(const struct switchdev_notifier_fdb_info *fdb_info)
|
|
|
|
{
|
|
|
|
return !fdb_info->added_by_user && !fdb_info->is_local;
|
|
|
|
}
|
|
|
|
|
2014-11-28 21:34:17 +08:00
|
|
|
#ifdef CONFIG_NET_SWITCHDEV
|
|
|
|
|
net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge
With the introduction of explicit offloading API in switchdev in commit
2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge
ports are offloaded"), we started having Ethernet switch drivers calling
directly into a function exported by net/bridge/br_switchdev.c, which is
a function exported by the bridge driver.
This means that drivers that did not have an explicit dependency on the
bridge before, like cpsw and am65-cpsw, now do - otherwise it is not
possible to call a symbol exported by a driver that can be built as
module unless you are a module too.
There was an attempt to solve the dependency issue in the form of commit
b0e81817629a ("net: build all switchdev drivers as modules when the
bridge is a module"). Grygorii Strashko, however, says about it:
| In my opinion, the problem is a bit bigger here than just fixing the
| build :(
|
| In case, of ^cpsw the switchdev mode is kinda optional and in many
| cases (especially for testing purposes, NFS) the multi-mac mode is
| still preferable mode.
|
| There were no such tight dependency between switchdev drivers and
| bridge core before and switchdev serviced as independent, notification
| based layer between them, so ^cpsw still can be "Y" and bridge can be
| "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE
| will need to be set as "Y", or we will have to update drivers to
| support build with BRIDGE=n and maintain separate builds for
| networking vs non-networking testing. But is this enough? Wouldn't
| it cause 'chain reaction' required to add more and more "Y" options
| (like CONFIG_VLAN_8021Q)?
|
| PS. Just to be sure we on the same page - ARM builds will be forced
| (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our
| automation testing will just fail with omap2plus_defconfig.
In the light of this, it would be desirable for some configurations to
avoid dependencies between switchdev drivers and the bridge, and have
the switchdev mode as completely optional within the driver.
Arnd Bergmann also tried to write a patch which better expressed the
build time dependency for Ethernet switch drivers where the switchdev
support is optional, like cpsw/am65-cpsw, and this made the drivers
follow the bridge (compile as module if the bridge is a module) only if
the optional switchdev support in the driver was enabled in the first
place:
https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/
but this still did not solve the fact that cpsw and am65-cpsw now must
be built as modules when the bridge is a module - it just expressed
correctly that optional dependency. But the new behavior is an apparent
regression from Grygorii's perspective.
So to support the use case where the Ethernet driver is built-in,
NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we
need a framework that can handle the possible absence of the bridge from
the running system, i.e. runtime bloatware as opposed to build-time
bloatware.
Luckily we already have this framework, since switchdev has been using
it extensively. Events from the bridge side are transmitted to the
driver side using notifier chains - this was originally done so that
unrelated drivers could snoop for events emitted by the bridge towards
ports that are implemented by other drivers (think of a switch driver
with LAG offload that listens for switchdev events on a bonding/team
interface that it offloads).
There are also events which are transmitted from the driver side to the
bridge side, which again are modeled using notifiers.
SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with
notifying the bridge that a MAC address has been dynamically learned.
So there is a precedent we can use for modeling the new framework.
The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work
that the bridge needs to do when a port becomes offloaded is blocking in
its nature: replay VLANs, MDBs etc. The calling context is indeed
blocking (we are under rtnl_mutex), but the existing switchdev
notification chain that the bridge is subscribed to is only the atomic
one. So we need to subscribe the bridge to the blocking switchdev
notification chain too.
This patch:
- keeps the driver-side perception of the switchdev_bridge_port_{,un}offload
unchanged
- moves the implementation of switchdev_bridge_port_{,un}offload from
the bridge module into the switchdev module.
- makes everybody that is subscribed to the switchdev blocking notifier
chain "hear" offload & unoffload events
- makes the bridge driver subscribe and handle those events
- moves the bridge driver's handling of those events into 2 new
functions called br_switchdev_port_{,un}offload. These functions
contain in fact the core of the logic that was previously in
switchdev_bridge_port_{,un}offload, just that now we go through an
extra indirection layer to reach them.
Unlike all the other switchdev notification structures, the structure
used to carry the bridge port information, struct
switchdev_notifier_brport_info, does not contain a "bool handled".
This is because in the current usage pattern, we always know that a
switchdev bridge port offloading event will be handled by the bridge,
because the switchdev_bridge_port_offload() call was initiated by a
NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a
bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event
couldn't have happened.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-04 04:34:08 +08:00
|
|
|
int switchdev_bridge_port_offload(struct net_device *brport_dev,
|
|
|
|
struct net_device *dev, const void *ctx,
|
|
|
|
struct notifier_block *atomic_nb,
|
|
|
|
struct notifier_block *blocking_nb,
|
|
|
|
bool tx_fwd_offload,
|
|
|
|
struct netlink_ext_ack *extack);
|
|
|
|
void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
|
|
|
|
const void *ctx,
|
|
|
|
struct notifier_block *atomic_nb,
|
|
|
|
struct notifier_block *blocking_nb);
|
|
|
|
|
2015-10-15 01:40:48 +08:00
|
|
|
void switchdev_deferred_process(void);
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
int switchdev_port_attr_set(struct net_device *dev,
|
2021-02-14 04:43:17 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack);
|
2015-10-01 17:03:46 +08:00
|
|
|
int switchdev_port_obj_add(struct net_device *dev,
|
2018-12-13 01:02:52 +08:00
|
|
|
const struct switchdev_obj *obj,
|
|
|
|
struct netlink_ext_ack *extack);
|
2015-10-01 17:03:46 +08:00
|
|
|
int switchdev_port_obj_del(struct net_device *dev,
|
2015-10-01 17:03:45 +08:00
|
|
|
const struct switchdev_obj *obj);
|
2018-11-23 07:28:25 +08:00
|
|
|
|
2015-05-11 00:47:46 +08:00
|
|
|
int register_switchdev_notifier(struct notifier_block *nb);
|
|
|
|
int unregister_switchdev_notifier(struct notifier_block *nb);
|
|
|
|
int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
|
2019-01-17 07:06:56 +08:00
|
|
|
struct switchdev_notifier_info *info,
|
|
|
|
struct netlink_ext_ack *extack);
|
2018-11-23 07:28:25 +08:00
|
|
|
|
|
|
|
int register_switchdev_blocking_notifier(struct notifier_block *nb);
|
|
|
|
int unregister_switchdev_blocking_notifier(struct notifier_block *nb);
|
|
|
|
int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev,
|
2018-12-13 01:02:54 +08:00
|
|
|
struct switchdev_notifier_info *info,
|
|
|
|
struct netlink_ext_ack *extack);
|
2018-11-23 07:28:25 +08:00
|
|
|
|
2015-07-19 09:24:50 +08:00
|
|
|
void switchdev_port_fwd_mark_set(struct net_device *dev,
|
|
|
|
struct net_device *group_dev,
|
|
|
|
bool joining);
|
2015-03-06 13:21:15 +08:00
|
|
|
|
2021-10-26 22:27:43 +08:00
|
|
|
int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event,
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
2021-10-26 22:27:43 +08:00
|
|
|
int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev,
|
|
|
|
unsigned long event, const void *ctx,
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info),
|
2021-10-26 22:27:43 +08:00
|
|
|
int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev,
|
|
|
|
unsigned long event, const void *ctx,
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info));
|
|
|
|
|
2018-11-23 07:29:44 +08:00
|
|
|
int switchdev_handle_port_obj_add(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*add_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj,
|
2018-12-13 01:02:56 +08:00
|
|
|
struct netlink_ext_ack *extack));
|
2018-11-23 07:29:44 +08:00
|
|
|
int switchdev_handle_port_obj_del(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*del_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj));
|
|
|
|
|
2019-02-28 03:44:25 +08:00
|
|
|
int switchdev_handle_port_attr_set(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_attr_info *port_attr_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*set_cb)(struct net_device *dev, const void *ctx,
|
2021-02-12 23:15:51 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack));
|
2014-11-28 21:34:17 +08:00
|
|
|
#else
|
|
|
|
|
net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge
With the introduction of explicit offloading API in switchdev in commit
2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge
ports are offloaded"), we started having Ethernet switch drivers calling
directly into a function exported by net/bridge/br_switchdev.c, which is
a function exported by the bridge driver.
This means that drivers that did not have an explicit dependency on the
bridge before, like cpsw and am65-cpsw, now do - otherwise it is not
possible to call a symbol exported by a driver that can be built as
module unless you are a module too.
There was an attempt to solve the dependency issue in the form of commit
b0e81817629a ("net: build all switchdev drivers as modules when the
bridge is a module"). Grygorii Strashko, however, says about it:
| In my opinion, the problem is a bit bigger here than just fixing the
| build :(
|
| In case, of ^cpsw the switchdev mode is kinda optional and in many
| cases (especially for testing purposes, NFS) the multi-mac mode is
| still preferable mode.
|
| There were no such tight dependency between switchdev drivers and
| bridge core before and switchdev serviced as independent, notification
| based layer between them, so ^cpsw still can be "Y" and bridge can be
| "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE
| will need to be set as "Y", or we will have to update drivers to
| support build with BRIDGE=n and maintain separate builds for
| networking vs non-networking testing. But is this enough? Wouldn't
| it cause 'chain reaction' required to add more and more "Y" options
| (like CONFIG_VLAN_8021Q)?
|
| PS. Just to be sure we on the same page - ARM builds will be forced
| (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our
| automation testing will just fail with omap2plus_defconfig.
In the light of this, it would be desirable for some configurations to
avoid dependencies between switchdev drivers and the bridge, and have
the switchdev mode as completely optional within the driver.
Arnd Bergmann also tried to write a patch which better expressed the
build time dependency for Ethernet switch drivers where the switchdev
support is optional, like cpsw/am65-cpsw, and this made the drivers
follow the bridge (compile as module if the bridge is a module) only if
the optional switchdev support in the driver was enabled in the first
place:
https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/
but this still did not solve the fact that cpsw and am65-cpsw now must
be built as modules when the bridge is a module - it just expressed
correctly that optional dependency. But the new behavior is an apparent
regression from Grygorii's perspective.
So to support the use case where the Ethernet driver is built-in,
NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we
need a framework that can handle the possible absence of the bridge from
the running system, i.e. runtime bloatware as opposed to build-time
bloatware.
Luckily we already have this framework, since switchdev has been using
it extensively. Events from the bridge side are transmitted to the
driver side using notifier chains - this was originally done so that
unrelated drivers could snoop for events emitted by the bridge towards
ports that are implemented by other drivers (think of a switch driver
with LAG offload that listens for switchdev events on a bonding/team
interface that it offloads).
There are also events which are transmitted from the driver side to the
bridge side, which again are modeled using notifiers.
SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with
notifying the bridge that a MAC address has been dynamically learned.
So there is a precedent we can use for modeling the new framework.
The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work
that the bridge needs to do when a port becomes offloaded is blocking in
its nature: replay VLANs, MDBs etc. The calling context is indeed
blocking (we are under rtnl_mutex), but the existing switchdev
notification chain that the bridge is subscribed to is only the atomic
one. So we need to subscribe the bridge to the blocking switchdev
notification chain too.
This patch:
- keeps the driver-side perception of the switchdev_bridge_port_{,un}offload
unchanged
- moves the implementation of switchdev_bridge_port_{,un}offload from
the bridge module into the switchdev module.
- makes everybody that is subscribed to the switchdev blocking notifier
chain "hear" offload & unoffload events
- makes the bridge driver subscribe and handle those events
- moves the bridge driver's handling of those events into 2 new
functions called br_switchdev_port_{,un}offload. These functions
contain in fact the core of the logic that was previously in
switchdev_bridge_port_{,un}offload, just that now we go through an
extra indirection layer to reach them.
Unlike all the other switchdev notification structures, the structure
used to carry the bridge port information, struct
switchdev_notifier_brport_info, does not contain a "bool handled".
This is because in the current usage pattern, we always know that a
switchdev bridge port offloading event will be handled by the bridge,
because the switchdev_bridge_port_offload() call was initiated by a
NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a
bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event
couldn't have happened.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-04 04:34:08 +08:00
|
|
|
static inline int
|
|
|
|
switchdev_bridge_port_offload(struct net_device *brport_dev,
|
|
|
|
struct net_device *dev, const void *ctx,
|
|
|
|
struct notifier_block *atomic_nb,
|
|
|
|
struct notifier_block *blocking_nb,
|
|
|
|
bool tx_fwd_offload,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
switchdev_bridge_port_unoffload(struct net_device *brport_dev,
|
|
|
|
const void *ctx,
|
|
|
|
struct notifier_block *atomic_nb,
|
|
|
|
struct notifier_block *blocking_nb)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2015-10-15 01:40:48 +08:00
|
|
|
static inline void switchdev_deferred_process(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
static inline int switchdev_port_attr_set(struct net_device *dev,
|
2021-02-16 05:09:11 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack)
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
2015-05-11 00:47:52 +08:00
|
|
|
static inline int switchdev_port_obj_add(struct net_device *dev,
|
2018-12-13 01:02:52 +08:00
|
|
|
const struct switchdev_obj *obj,
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-05-11 00:47:52 +08:00
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int switchdev_port_obj_del(struct net_device *dev,
|
2015-10-01 17:03:45 +08:00
|
|
|
const struct switchdev_obj *obj)
|
2015-05-11 00:47:52 +08:00
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
2015-05-11 00:47:46 +08:00
|
|
|
static inline int register_switchdev_notifier(struct notifier_block *nb)
|
2015-01-16 06:49:36 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-05-11 00:47:46 +08:00
|
|
|
static inline int unregister_switchdev_notifier(struct notifier_block *nb)
|
2015-01-16 06:49:36 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-05-11 00:47:46 +08:00
|
|
|
static inline int call_switchdev_notifiers(unsigned long val,
|
|
|
|
struct net_device *dev,
|
2019-01-17 07:06:56 +08:00
|
|
|
struct switchdev_notifier_info *info,
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-01-16 06:49:36 +08:00
|
|
|
{
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
2018-11-23 07:28:25 +08:00
|
|
|
static inline int
|
|
|
|
register_switchdev_blocking_notifier(struct notifier_block *nb)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
unregister_switchdev_blocking_notifier(struct notifier_block *nb)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
call_switchdev_blocking_notifiers(unsigned long val,
|
|
|
|
struct net_device *dev,
|
2018-12-13 01:02:54 +08:00
|
|
|
struct switchdev_notifier_info *info,
|
|
|
|
struct netlink_ext_ack *extack)
|
2018-11-23 07:28:25 +08:00
|
|
|
{
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
2018-11-23 07:29:44 +08:00
|
|
|
static inline int
|
2021-10-26 22:27:43 +08:00
|
|
|
switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event,
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
2021-10-26 22:27:43 +08:00
|
|
|
int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev,
|
|
|
|
unsigned long event, const void *ctx,
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info),
|
2021-10-26 22:27:43 +08:00
|
|
|
int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev,
|
|
|
|
unsigned long event, const void *ctx,
|
2021-07-21 01:35:56 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info))
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
2018-11-23 07:29:44 +08:00
|
|
|
switchdev_handle_port_obj_add(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*add_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj,
|
2018-12-13 01:02:56 +08:00
|
|
|
struct netlink_ext_ack *extack))
|
2018-11-23 07:29:44 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
switchdev_handle_port_obj_del(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*del_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj))
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-02-28 03:44:25 +08:00
|
|
|
static inline int
|
|
|
|
switchdev_handle_port_attr_set(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_attr_info *port_attr_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*set_cb)(struct net_device *dev, const void *ctx,
|
2021-02-12 23:15:51 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack))
|
2019-02-28 03:44:25 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2014-11-28 21:34:17 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _LINUX_SWITCHDEV_H_ */
|