2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2014-11-28 21:34:17 +08:00
|
|
|
/*
|
|
|
|
* net/switchdev/switchdev.c - Switch device API
|
2015-09-24 16:02:41 +08:00
|
|
|
* Copyright (c) 2014-2015 Jiri Pirko <jiri@resnulli.us>
|
2015-03-10 04:59:09 +08:00
|
|
|
* Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com>
|
2014-11-28 21:34:17 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/init.h>
|
2015-01-16 06:49:36 +08:00
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <linux/notifier.h>
|
2014-11-28 21:34:17 +08:00
|
|
|
#include <linux/netdevice.h>
|
2015-10-15 01:40:51 +08:00
|
|
|
#include <linux/etherdevice.h>
|
2015-05-11 00:47:56 +08:00
|
|
|
#include <linux/if_bridge.h>
|
2015-09-24 16:02:41 +08:00
|
|
|
#include <linux/list.h>
|
2015-10-15 01:40:48 +08:00
|
|
|
#include <linux/workqueue.h>
|
2015-10-12 20:31:01 +08:00
|
|
|
#include <linux/if_vlan.h>
|
2016-01-27 22:16:43 +08:00
|
|
|
#include <linux/rtnetlink.h>
|
2014-11-28 21:34:17 +08:00
|
|
|
#include <net/switchdev.h>
|
|
|
|
|
net: bridge: switchdev: Skip MDB replays of deferred events on offload
[ Upstream commit dc489f86257cab5056e747344f17a164f63bff4b ]
Before this change, generation of the list of MDB events to replay
would race against the creation of new group memberships, either from
the IGMP/MLD snooping logic or from user configuration.
While new memberships are immediately visible to walkers of
br->mdb_list, the notification of their existence to switchdev event
subscribers is deferred until a later point in time. So if a replay
list was generated during a time that overlapped with such a window,
it would also contain a replay of the not-yet-delivered event.
The driver would thus receive two copies of what the bridge internally
considered to be one single event. On destruction of the bridge, only
a single membership deletion event was therefore sent. As a
consequence of this, drivers which reference count memberships (at
least DSA), would be left with orphan groups in their hardware
database when the bridge was destroyed.
This is only an issue when replaying additions. While deletion events
may still be pending on the deferred queue, they will already have
been removed from br->mdb_list, so no duplicates can be generated in
that scenario.
To a user this meant that old group memberships, from a bridge in
which a port was previously attached, could be reanimated (in
hardware) when the port joined a new bridge, without the new bridge's
knowledge.
For example, on an mv88e6xxx system, create a snooping bridge and
immediately add a port to it:
root@infix-06-0b-00:~$ ip link add dev br0 up type bridge mcast_snooping 1 && \
> ip link set dev x3 up master br0
And then destroy the bridge:
root@infix-06-0b-00:~$ ip link del dev br0
root@infix-06-0b-00:~$ mvls atu
ADDRESS FID STATE Q F 0 1 2 3 4 5 6 7 8 9 a
DEV:0 Marvell 88E6393X
33:33:00:00:00:6a 1 static - - 0 . . . . . . . . . .
33:33:ff:87:e4:3f 1 static - - 0 . . . . . . . . . .
ff:ff:ff:ff:ff:ff 1 static - - 0 1 2 3 4 5 6 7 8 9 a
root@infix-06-0b-00:~$
The two IPv6 groups remain in the hardware database because the
port (x3) is notified of the host's membership twice: once via the
original event and once via a replay. Since only a single delete
notification is sent, the count remains at 1 when the bridge is
destroyed.
Then add the same port (or another port belonging to the same hardware
domain) to a new bridge, this time with snooping disabled:
root@infix-06-0b-00:~$ ip link add dev br1 up type bridge mcast_snooping 0 && \
> ip link set dev x3 up master br1
All multicast, including the two IPv6 groups from br0, should now be
flooded, according to the policy of br1. But instead the old
memberships are still active in the hardware database, causing the
switch to only forward traffic to those groups towards the CPU (port
0).
Eliminate the race in two steps:
1. Grab the write-side lock of the MDB while generating the replay
list.
This prevents new memberships from showing up while we are generating
the replay list. But it leaves the scenario in which a deferred event
was already generated, but not delivered, before we grabbed the
lock. Therefore:
2. Make sure that no deferred version of a replay event is already
enqueued to the switchdev deferred queue, before adding it to the
replay list, when replaying additions.
Fixes: 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries")
Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-15 05:40:03 +08:00
|
|
|
static bool switchdev_obj_eq(const struct switchdev_obj *a,
|
|
|
|
const struct switchdev_obj *b)
|
|
|
|
{
|
|
|
|
const struct switchdev_obj_port_vlan *va, *vb;
|
|
|
|
const struct switchdev_obj_port_mdb *ma, *mb;
|
|
|
|
|
|
|
|
if (a->id != b->id || a->orig_dev != b->orig_dev)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
switch (a->id) {
|
|
|
|
case SWITCHDEV_OBJ_ID_PORT_VLAN:
|
|
|
|
va = SWITCHDEV_OBJ_PORT_VLAN(a);
|
|
|
|
vb = SWITCHDEV_OBJ_PORT_VLAN(b);
|
|
|
|
return va->flags == vb->flags &&
|
|
|
|
va->vid == vb->vid &&
|
|
|
|
va->changed == vb->changed;
|
|
|
|
case SWITCHDEV_OBJ_ID_PORT_MDB:
|
|
|
|
case SWITCHDEV_OBJ_ID_HOST_MDB:
|
|
|
|
ma = SWITCHDEV_OBJ_PORT_MDB(a);
|
|
|
|
mb = SWITCHDEV_OBJ_PORT_MDB(b);
|
|
|
|
return ma->vid == mb->vid &&
|
|
|
|
ether_addr_equal(ma->addr, mb->addr);
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
2015-10-15 01:40:48 +08:00
|
|
|
static LIST_HEAD(deferred);
|
|
|
|
static DEFINE_SPINLOCK(deferred_lock);
|
|
|
|
|
|
|
|
typedef void switchdev_deferred_func_t(struct net_device *dev,
|
|
|
|
const void *data);
|
|
|
|
|
|
|
|
struct switchdev_deferred_item {
|
|
|
|
struct list_head list;
|
|
|
|
struct net_device *dev;
|
2021-12-07 09:30:31 +08:00
|
|
|
netdevice_tracker dev_tracker;
|
2015-10-15 01:40:48 +08:00
|
|
|
switchdev_deferred_func_t *func;
|
2020-02-18 04:02:36 +08:00
|
|
|
unsigned long data[];
|
2015-10-15 01:40:48 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static struct switchdev_deferred_item *switchdev_deferred_dequeue(void)
|
|
|
|
{
|
|
|
|
struct switchdev_deferred_item *dfitem;
|
|
|
|
|
|
|
|
spin_lock_bh(&deferred_lock);
|
|
|
|
if (list_empty(&deferred)) {
|
|
|
|
dfitem = NULL;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
dfitem = list_first_entry(&deferred,
|
|
|
|
struct switchdev_deferred_item, list);
|
|
|
|
list_del(&dfitem->list);
|
|
|
|
unlock:
|
|
|
|
spin_unlock_bh(&deferred_lock);
|
|
|
|
return dfitem;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* switchdev_deferred_process - Process ops in deferred queue
|
|
|
|
*
|
|
|
|
* Called to flush the ops currently queued in deferred ops queue.
|
|
|
|
* rtnl_lock must be held.
|
|
|
|
*/
|
|
|
|
void switchdev_deferred_process(void)
|
|
|
|
{
|
|
|
|
struct switchdev_deferred_item *dfitem;
|
|
|
|
|
|
|
|
ASSERT_RTNL();
|
|
|
|
|
|
|
|
while ((dfitem = switchdev_deferred_dequeue())) {
|
|
|
|
dfitem->func(dfitem->dev, dfitem->data);
|
2022-06-08 12:39:55 +08:00
|
|
|
netdev_put(dfitem->dev, &dfitem->dev_tracker);
|
2015-10-15 01:40:48 +08:00
|
|
|
kfree(dfitem);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_deferred_process);
|
|
|
|
|
|
|
|
static void switchdev_deferred_process_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
rtnl_lock();
|
|
|
|
switchdev_deferred_process();
|
|
|
|
rtnl_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
static DECLARE_WORK(deferred_process_work, switchdev_deferred_process_work);
|
|
|
|
|
|
|
|
static int switchdev_deferred_enqueue(struct net_device *dev,
|
|
|
|
const void *data, size_t data_len,
|
|
|
|
switchdev_deferred_func_t *func)
|
|
|
|
{
|
|
|
|
struct switchdev_deferred_item *dfitem;
|
|
|
|
|
2022-02-10 14:10:08 +08:00
|
|
|
dfitem = kmalloc(struct_size(dfitem, data, data_len), GFP_ATOMIC);
|
2015-10-15 01:40:48 +08:00
|
|
|
if (!dfitem)
|
|
|
|
return -ENOMEM;
|
|
|
|
dfitem->dev = dev;
|
|
|
|
dfitem->func = func;
|
|
|
|
memcpy(dfitem->data, data, data_len);
|
2022-06-08 12:39:55 +08:00
|
|
|
netdev_hold(dev, &dfitem->dev_tracker, GFP_ATOMIC);
|
2015-10-15 01:40:48 +08:00
|
|
|
spin_lock_bh(&deferred_lock);
|
|
|
|
list_add_tail(&dfitem->list, &deferred);
|
|
|
|
spin_unlock_bh(&deferred_lock);
|
|
|
|
schedule_work(&deferred_process_work);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-02-28 03:44:31 +08:00
|
|
|
static int switchdev_port_attr_notify(enum switchdev_notifier_type nt,
|
|
|
|
struct net_device *dev,
|
2021-02-14 04:43:17 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack)
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
{
|
2019-02-28 03:44:31 +08:00
|
|
|
int err;
|
|
|
|
int rc;
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
|
2019-02-28 03:44:31 +08:00
|
|
|
struct switchdev_notifier_port_attr_info attr_info = {
|
|
|
|
.attr = attr,
|
|
|
|
.handled = false,
|
|
|
|
};
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
|
2019-02-28 03:44:31 +08:00
|
|
|
rc = call_switchdev_blocking_notifiers(nt, dev,
|
2021-02-14 04:43:17 +08:00
|
|
|
&attr_info.info, extack);
|
2019-02-28 03:44:31 +08:00
|
|
|
err = notifier_to_errno(rc);
|
|
|
|
if (err) {
|
|
|
|
WARN_ON(!attr_info.handled);
|
|
|
|
return err;
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
}
|
|
|
|
|
2019-02-28 03:44:31 +08:00
|
|
|
if (!attr_info.handled)
|
|
|
|
return -EOPNOTSUPP;
|
2015-10-09 10:23:18 +08:00
|
|
|
|
2019-02-28 03:44:31 +08:00
|
|
|
return 0;
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
}
|
|
|
|
|
2015-10-15 01:40:50 +08:00
|
|
|
static int switchdev_port_attr_set_now(struct net_device *dev,
|
2021-02-14 04:43:17 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack)
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
{
|
2021-02-14 04:43:17 +08:00
|
|
|
return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr,
|
|
|
|
extack);
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
}
|
2015-10-15 01:40:50 +08:00
|
|
|
|
|
|
|
static void switchdev_port_attr_set_deferred(struct net_device *dev,
|
|
|
|
const void *data)
|
|
|
|
{
|
|
|
|
const struct switchdev_attr *attr = data;
|
|
|
|
int err;
|
|
|
|
|
2021-02-14 04:43:17 +08:00
|
|
|
err = switchdev_port_attr_set_now(dev, attr, NULL);
|
2015-10-15 01:40:50 +08:00
|
|
|
if (err && err != -EOPNOTSUPP)
|
|
|
|
netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n",
|
|
|
|
err, attr->id);
|
2016-04-21 18:52:43 +08:00
|
|
|
if (attr->complete)
|
|
|
|
attr->complete(dev, err, attr->complete_priv);
|
2015-10-15 01:40:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int switchdev_port_attr_set_defer(struct net_device *dev,
|
|
|
|
const struct switchdev_attr *attr)
|
|
|
|
{
|
|
|
|
return switchdev_deferred_enqueue(dev, attr, sizeof(*attr),
|
|
|
|
switchdev_port_attr_set_deferred);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* switchdev_port_attr_set - Set port attribute
|
|
|
|
*
|
|
|
|
* @dev: port device
|
|
|
|
* @attr: attribute to set
|
2021-02-14 04:43:17 +08:00
|
|
|
* @extack: netlink extended ack, for error message propagation
|
2015-10-15 01:40:50 +08:00
|
|
|
*
|
|
|
|
* rtnl_lock must be held and must not be in atomic section,
|
|
|
|
* in case SWITCHDEV_F_DEFER flag is not set.
|
|
|
|
*/
|
|
|
|
int switchdev_port_attr_set(struct net_device *dev,
|
2021-02-14 04:43:17 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-10-15 01:40:50 +08:00
|
|
|
{
|
|
|
|
if (attr->flags & SWITCHDEV_F_DEFER)
|
|
|
|
return switchdev_port_attr_set_defer(dev, attr);
|
|
|
|
ASSERT_RTNL();
|
2021-02-14 04:43:17 +08:00
|
|
|
return switchdev_port_attr_set_now(dev, attr, extack);
|
2015-10-15 01:40:50 +08:00
|
|
|
}
|
switchdev: introduce get/set attrs ops
Add two new swdev ops for get/set switch port attributes. Most swdev
interactions on a port are gets or sets on port attributes, so rather than
adding ops for each attribute, let's define clean get/set ops for all
attributes, and then we can have clear, consistent rules on how attributes
propagate on stacked devs.
Add the basic algorithms for get/set attr ops. Use the same recusive algo
to walk lower devs we've used for STP updates, for example. For get,
compare attr value for each lower dev and only return success if attr
values match across all lower devs. For sets, set the same attr value for
all lower devs. We'll use a two-phase prepare-commit transaction model for
sets. In the first phase, the driver(s) are asked if attr set is OK. If
all OK, the commit attr set in second phase. A driver would NACK the
prepare phase if it can't set the attr due to lack of resources or support,
within it's control. RTNL lock must be held across both phases because
we'll recurse all lower devs first in prepare phase, and then recurse all
lower devs again in commit phase. If any lower dev fails the prepare
phase, we need to abort the transaction for all lower devs.
If lower dev recusion isn't desired, allow a flag SWITCHDEV_F_NO_RECURSE to
indicate get/set only work on port (lowest) device.
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-11 00:47:48 +08:00
|
|
|
EXPORT_SYMBOL_GPL(switchdev_port_attr_set);
|
|
|
|
|
2015-10-29 14:17:31 +08:00
|
|
|
static size_t switchdev_obj_size(const struct switchdev_obj *obj)
|
|
|
|
{
|
|
|
|
switch (obj->id) {
|
|
|
|
case SWITCHDEV_OBJ_ID_PORT_VLAN:
|
|
|
|
return sizeof(struct switchdev_obj_port_vlan);
|
2016-01-11 04:06:22 +08:00
|
|
|
case SWITCHDEV_OBJ_ID_PORT_MDB:
|
|
|
|
return sizeof(struct switchdev_obj_port_mdb);
|
2017-11-10 06:10:59 +08:00
|
|
|
case SWITCHDEV_OBJ_ID_HOST_MDB:
|
|
|
|
return sizeof(struct switchdev_obj_port_mdb);
|
2015-10-29 14:17:31 +08:00
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-11-23 07:32:57 +08:00
|
|
|
static int switchdev_port_obj_notify(enum switchdev_notifier_type nt,
|
|
|
|
struct net_device *dev,
|
|
|
|
const struct switchdev_obj *obj,
|
2018-12-13 01:02:52 +08:00
|
|
|
struct netlink_ext_ack *extack)
|
2015-05-11 00:47:52 +08:00
|
|
|
{
|
2018-11-23 07:32:57 +08:00
|
|
|
int rc;
|
|
|
|
int err;
|
2015-05-11 00:47:52 +08:00
|
|
|
|
2018-11-23 07:32:57 +08:00
|
|
|
struct switchdev_notifier_port_obj_info obj_info = {
|
|
|
|
.obj = obj,
|
|
|
|
.handled = false,
|
|
|
|
};
|
2015-05-11 00:47:52 +08:00
|
|
|
|
2018-12-13 01:02:54 +08:00
|
|
|
rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info, extack);
|
2018-11-23 07:32:57 +08:00
|
|
|
err = notifier_to_errno(rc);
|
|
|
|
if (err) {
|
|
|
|
WARN_ON(!obj_info.handled);
|
|
|
|
return err;
|
2015-05-11 00:47:52 +08:00
|
|
|
}
|
2018-11-23 07:32:57 +08:00
|
|
|
if (!obj_info.handled)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
return 0;
|
2015-05-11 00:47:52 +08:00
|
|
|
}
|
|
|
|
|
2015-10-15 01:40:52 +08:00
|
|
|
static void switchdev_port_obj_add_deferred(struct net_device *dev,
|
|
|
|
const void *data)
|
|
|
|
{
|
|
|
|
const struct switchdev_obj *obj = data;
|
|
|
|
int err;
|
|
|
|
|
2021-01-09 08:01:49 +08:00
|
|
|
ASSERT_RTNL();
|
|
|
|
err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD,
|
|
|
|
dev, obj, NULL);
|
2015-10-15 01:40:52 +08:00
|
|
|
if (err && err != -EOPNOTSUPP)
|
|
|
|
netdev_err(dev, "failed (err=%d) to add object (id=%d)\n",
|
|
|
|
err, obj->id);
|
2016-04-21 18:52:43 +08:00
|
|
|
if (obj->complete)
|
|
|
|
obj->complete(dev, err, obj->complete_priv);
|
2015-10-15 01:40:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int switchdev_port_obj_add_defer(struct net_device *dev,
|
|
|
|
const struct switchdev_obj *obj)
|
|
|
|
{
|
2015-10-29 14:17:31 +08:00
|
|
|
return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj),
|
2015-10-15 01:40:52 +08:00
|
|
|
switchdev_port_obj_add_deferred);
|
|
|
|
}
|
2015-05-11 00:47:52 +08:00
|
|
|
|
|
|
|
/**
|
2015-10-15 01:40:52 +08:00
|
|
|
* switchdev_port_obj_add - Add port object
|
2015-05-11 00:47:52 +08:00
|
|
|
*
|
|
|
|
* @dev: port device
|
2015-10-15 01:40:52 +08:00
|
|
|
* @obj: object to add
|
2020-07-13 07:15:13 +08:00
|
|
|
* @extack: netlink extended ack
|
2015-10-15 01:40:52 +08:00
|
|
|
*
|
|
|
|
* rtnl_lock must be held and must not be in atomic section,
|
|
|
|
* in case SWITCHDEV_F_DEFER flag is not set.
|
2015-05-11 00:47:52 +08:00
|
|
|
*/
|
2015-10-15 01:40:52 +08:00
|
|
|
int switchdev_port_obj_add(struct net_device *dev,
|
2018-12-13 01:02:52 +08:00
|
|
|
const struct switchdev_obj *obj,
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-10-15 01:40:52 +08:00
|
|
|
{
|
|
|
|
if (obj->flags & SWITCHDEV_F_DEFER)
|
|
|
|
return switchdev_port_obj_add_defer(dev, obj);
|
|
|
|
ASSERT_RTNL();
|
2021-01-09 08:01:49 +08:00
|
|
|
return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD,
|
|
|
|
dev, obj, extack);
|
2015-10-15 01:40:52 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_port_obj_add);
|
|
|
|
|
|
|
|
static int switchdev_port_obj_del_now(struct net_device *dev,
|
|
|
|
const struct switchdev_obj *obj)
|
2015-05-11 00:47:52 +08:00
|
|
|
{
|
2018-11-23 07:32:57 +08:00
|
|
|
return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL,
|
net: switchdev: remove the transaction structure from port object notifiers
Since the introduction of the switchdev API, port objects were
transmitted to drivers for offloading using a two-step transactional
model, with a prepare phase that was supposed to catch all errors, and a
commit phase that was supposed to never fail.
Some classes of failures can never be avoided, like hardware access, or
memory allocation. In the latter case, merely attempting to move the
memory allocation to the preparation phase makes it impossible to avoid
memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused
transaction item queue") which has removed the unused mechanism of
passing on the allocated memory between one phase and another.
It is time we admit that separating the preparation from the commit
phase is something that is best left for the driver to decide, and not
something that should be baked into the API, especially since there are
no switchdev callers that depend on this.
This patch removes the struct switchdev_trans member from switchdev port
object notifier structures, and converts drivers to not look at this
member.
Where driver conversion is trivial (like in the case of the Marvell
Prestera driver, NXP DPAA2 switch, TI CPSW, and Rocker drivers), it is
done in this patch.
Where driver conversion needs more attention (DSA, Mellanox Spectrum),
the conversion is left for subsequent patches and here we only fake the
prepare/commit phases at a lower level, just not in the switchdev
notifier itself.
Where the code has a natural structure that is best left alone as a
preparation and a commit phase (as in the case of the Ocelot switch),
that structure is left in place, just made to not depend upon the
switchdev transactional model.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 08:01:48 +08:00
|
|
|
dev, obj, NULL);
|
2015-05-11 00:47:52 +08:00
|
|
|
}
|
2015-10-15 01:40:52 +08:00
|
|
|
|
|
|
|
static void switchdev_port_obj_del_deferred(struct net_device *dev,
|
|
|
|
const void *data)
|
|
|
|
{
|
|
|
|
const struct switchdev_obj *obj = data;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = switchdev_port_obj_del_now(dev, obj);
|
|
|
|
if (err && err != -EOPNOTSUPP)
|
|
|
|
netdev_err(dev, "failed (err=%d) to del object (id=%d)\n",
|
|
|
|
err, obj->id);
|
2016-04-21 18:52:43 +08:00
|
|
|
if (obj->complete)
|
|
|
|
obj->complete(dev, err, obj->complete_priv);
|
2015-10-15 01:40:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int switchdev_port_obj_del_defer(struct net_device *dev,
|
|
|
|
const struct switchdev_obj *obj)
|
|
|
|
{
|
2015-10-29 14:17:31 +08:00
|
|
|
return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj),
|
2015-10-15 01:40:52 +08:00
|
|
|
switchdev_port_obj_del_deferred);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* switchdev_port_obj_del - Delete port object
|
|
|
|
*
|
|
|
|
* @dev: port device
|
|
|
|
* @obj: object to delete
|
|
|
|
*
|
|
|
|
* rtnl_lock must be held and must not be in atomic section,
|
|
|
|
* in case SWITCHDEV_F_DEFER flag is not set.
|
|
|
|
*/
|
|
|
|
int switchdev_port_obj_del(struct net_device *dev,
|
|
|
|
const struct switchdev_obj *obj)
|
|
|
|
{
|
|
|
|
if (obj->flags & SWITCHDEV_F_DEFER)
|
|
|
|
return switchdev_port_obj_del_defer(dev, obj);
|
|
|
|
ASSERT_RTNL();
|
|
|
|
return switchdev_port_obj_del_now(dev, obj);
|
|
|
|
}
|
2015-05-11 00:47:52 +08:00
|
|
|
EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
|
|
|
|
|
net: bridge: switchdev: Skip MDB replays of deferred events on offload
[ Upstream commit dc489f86257cab5056e747344f17a164f63bff4b ]
Before this change, generation of the list of MDB events to replay
would race against the creation of new group memberships, either from
the IGMP/MLD snooping logic or from user configuration.
While new memberships are immediately visible to walkers of
br->mdb_list, the notification of their existence to switchdev event
subscribers is deferred until a later point in time. So if a replay
list was generated during a time that overlapped with such a window,
it would also contain a replay of the not-yet-delivered event.
The driver would thus receive two copies of what the bridge internally
considered to be one single event. On destruction of the bridge, only
a single membership deletion event was therefore sent. As a
consequence of this, drivers which reference count memberships (at
least DSA), would be left with orphan groups in their hardware
database when the bridge was destroyed.
This is only an issue when replaying additions. While deletion events
may still be pending on the deferred queue, they will already have
been removed from br->mdb_list, so no duplicates can be generated in
that scenario.
To a user this meant that old group memberships, from a bridge in
which a port was previously attached, could be reanimated (in
hardware) when the port joined a new bridge, without the new bridge's
knowledge.
For example, on an mv88e6xxx system, create a snooping bridge and
immediately add a port to it:
root@infix-06-0b-00:~$ ip link add dev br0 up type bridge mcast_snooping 1 && \
> ip link set dev x3 up master br0
And then destroy the bridge:
root@infix-06-0b-00:~$ ip link del dev br0
root@infix-06-0b-00:~$ mvls atu
ADDRESS FID STATE Q F 0 1 2 3 4 5 6 7 8 9 a
DEV:0 Marvell 88E6393X
33:33:00:00:00:6a 1 static - - 0 . . . . . . . . . .
33:33:ff:87:e4:3f 1 static - - 0 . . . . . . . . . .
ff:ff:ff:ff:ff:ff 1 static - - 0 1 2 3 4 5 6 7 8 9 a
root@infix-06-0b-00:~$
The two IPv6 groups remain in the hardware database because the
port (x3) is notified of the host's membership twice: once via the
original event and once via a replay. Since only a single delete
notification is sent, the count remains at 1 when the bridge is
destroyed.
Then add the same port (or another port belonging to the same hardware
domain) to a new bridge, this time with snooping disabled:
root@infix-06-0b-00:~$ ip link add dev br1 up type bridge mcast_snooping 0 && \
> ip link set dev x3 up master br1
All multicast, including the two IPv6 groups from br0, should now be
flooded, according to the policy of br1. But instead the old
memberships are still active in the hardware database, causing the
switch to only forward traffic to those groups towards the CPU (port
0).
Eliminate the race in two steps:
1. Grab the write-side lock of the MDB while generating the replay
list.
This prevents new memberships from showing up while we are generating
the replay list. But it leaves the scenario in which a deferred event
was already generated, but not delivered, before we grabbed the
lock. Therefore:
2. Make sure that no deferred version of a replay event is already
enqueued to the switchdev deferred queue, before adding it to the
replay list, when replaying additions.
Fixes: 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries")
Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-15 05:40:03 +08:00
|
|
|
/**
|
|
|
|
* switchdev_port_obj_act_is_deferred - Is object action pending?
|
|
|
|
*
|
|
|
|
* @dev: port device
|
|
|
|
* @nt: type of action; add or delete
|
|
|
|
* @obj: object to test
|
|
|
|
*
|
|
|
|
* Returns true if a deferred item is pending, which is
|
|
|
|
* equivalent to the action @nt on an object @obj.
|
|
|
|
*
|
|
|
|
* rtnl_lock must be held.
|
|
|
|
*/
|
|
|
|
bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
|
|
|
|
enum switchdev_notifier_type nt,
|
|
|
|
const struct switchdev_obj *obj)
|
|
|
|
{
|
|
|
|
struct switchdev_deferred_item *dfitem;
|
|
|
|
bool found = false;
|
|
|
|
|
|
|
|
ASSERT_RTNL();
|
|
|
|
|
|
|
|
spin_lock_bh(&deferred_lock);
|
|
|
|
|
|
|
|
list_for_each_entry(dfitem, &deferred, list) {
|
|
|
|
if (dfitem->dev != dev)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if ((dfitem->func == switchdev_port_obj_add_deferred &&
|
|
|
|
nt == SWITCHDEV_PORT_OBJ_ADD) ||
|
|
|
|
(dfitem->func == switchdev_port_obj_del_deferred &&
|
|
|
|
nt == SWITCHDEV_PORT_OBJ_DEL)) {
|
|
|
|
if (switchdev_obj_eq((const void *)dfitem->data, obj)) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock_bh(&deferred_lock);
|
|
|
|
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred);
|
|
|
|
|
2017-06-08 14:44:13 +08:00
|
|
|
static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain);
|
2018-11-23 07:28:25 +08:00
|
|
|
static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain);
|
2015-01-16 06:49:36 +08:00
|
|
|
|
|
|
|
/**
|
2015-05-11 00:47:46 +08:00
|
|
|
* register_switchdev_notifier - Register notifier
|
2015-01-16 06:49:36 +08:00
|
|
|
* @nb: notifier_block
|
|
|
|
*
|
2017-06-08 14:44:13 +08:00
|
|
|
* Register switch device notifier.
|
2015-01-16 06:49:36 +08:00
|
|
|
*/
|
2015-05-11 00:47:46 +08:00
|
|
|
int register_switchdev_notifier(struct notifier_block *nb)
|
2015-01-16 06:49:36 +08:00
|
|
|
{
|
2017-06-08 14:44:13 +08:00
|
|
|
return atomic_notifier_chain_register(&switchdev_notif_chain, nb);
|
2015-01-16 06:49:36 +08:00
|
|
|
}
|
2015-05-11 00:47:46 +08:00
|
|
|
EXPORT_SYMBOL_GPL(register_switchdev_notifier);
|
2015-01-16 06:49:36 +08:00
|
|
|
|
|
|
|
/**
|
2015-05-11 00:47:46 +08:00
|
|
|
* unregister_switchdev_notifier - Unregister notifier
|
2015-01-16 06:49:36 +08:00
|
|
|
* @nb: notifier_block
|
|
|
|
*
|
|
|
|
* Unregister switch device notifier.
|
|
|
|
*/
|
2015-05-11 00:47:46 +08:00
|
|
|
int unregister_switchdev_notifier(struct notifier_block *nb)
|
2015-01-16 06:49:36 +08:00
|
|
|
{
|
2017-06-08 14:44:13 +08:00
|
|
|
return atomic_notifier_chain_unregister(&switchdev_notif_chain, nb);
|
2015-01-16 06:49:36 +08:00
|
|
|
}
|
2015-05-11 00:47:46 +08:00
|
|
|
EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
|
2015-01-16 06:49:36 +08:00
|
|
|
|
|
|
|
/**
|
2015-05-11 00:47:46 +08:00
|
|
|
* call_switchdev_notifiers - Call notifiers
|
2015-01-16 06:49:36 +08:00
|
|
|
* @val: value passed unmodified to notifier function
|
|
|
|
* @dev: port device
|
|
|
|
* @info: notifier information data
|
2020-09-22 21:32:19 +08:00
|
|
|
* @extack: netlink extended ack
|
2017-06-08 14:44:13 +08:00
|
|
|
* Call all network notifier blocks.
|
2015-01-16 06:49:36 +08:00
|
|
|
*/
|
2015-05-11 00:47:46 +08:00
|
|
|
int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
|
2019-01-17 07:06:56 +08:00
|
|
|
struct switchdev_notifier_info *info,
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-01-16 06:49:36 +08:00
|
|
|
{
|
|
|
|
info->dev = dev;
|
2019-01-17 07:06:56 +08:00
|
|
|
info->extack = extack;
|
2017-06-08 14:44:13 +08:00
|
|
|
return atomic_notifier_call_chain(&switchdev_notif_chain, val, info);
|
2015-01-16 06:49:36 +08:00
|
|
|
}
|
2015-05-11 00:47:46 +08:00
|
|
|
EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
|
2015-01-30 14:40:13 +08:00
|
|
|
|
2018-11-23 07:28:25 +08:00
|
|
|
int register_switchdev_blocking_notifier(struct notifier_block *nb)
|
|
|
|
{
|
|
|
|
struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain;
|
|
|
|
|
|
|
|
return blocking_notifier_chain_register(chain, nb);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier);
|
|
|
|
|
|
|
|
int unregister_switchdev_blocking_notifier(struct notifier_block *nb)
|
|
|
|
{
|
|
|
|
struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain;
|
|
|
|
|
|
|
|
return blocking_notifier_chain_unregister(chain, nb);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier);
|
|
|
|
|
|
|
|
int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev,
|
2018-12-13 01:02:54 +08:00
|
|
|
struct switchdev_notifier_info *info,
|
|
|
|
struct netlink_ext_ack *extack)
|
2018-11-23 07:28:25 +08:00
|
|
|
{
|
|
|
|
info->dev = dev;
|
2018-12-13 01:02:54 +08:00
|
|
|
info->extack = extack;
|
2018-11-23 07:28:25 +08:00
|
|
|
return blocking_notifier_call_chain(&switchdev_blocking_notif_chain,
|
|
|
|
val, info);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers);
|
|
|
|
|
net: switchdev: fix FDB entries towards foreign ports not getting propagated to us
The newly introduced switchdev_handle_fdb_{add,del}_to_device helpers
solved a problem but introduced another one. They have a severe design
bug: they do not propagate FDB events on foreign interfaces to us, i.e.
this use case:
br0
/ \
/ \
/ \
/ \
swp0 eno0
(switchdev) (foreign)
when an address is learned on eno0, what is supposed to happen is that
this event should also be propagated towards swp0. Somehow I managed to
convince myself that this did work correctly, but obviously it does not.
The trouble with foreign interfaces is that we must reach a switchdev
net_device pointer through a foreign net_device that has no direct
upper/lower relationship with it. So we need to do exploratory searching
through the lower interfaces of the foreign net_device's bridge upper
(to reach swp0 from eno0, we must check its upper, br0, for lower
interfaces that pass the check_cb and foreign_dev_check_cb). This is
something that the previous code did not do, it just assumed that "dev"
will become a switchdev interface at some point, somehow, probably by
magic.
With this patch, assisted address learning on the CPU port works again
in DSA:
ip link add br0 type bridge
ip link set swp0 master br0
ip link set eno0 master br0
ip link set br0 up
[ 46.708929] mscc_felix 0000:00:00.5 swp0: Adding FDB entry towards eno0, addr 00:04:9f:05:f4:ab vid 0 as host address
Fixes: 8ca07176ab00 ("net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE")
Reported-by: Eric Woudstra <ericwouds@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-22 07:05:55 +08:00
|
|
|
struct switchdev_nested_priv {
|
|
|
|
bool (*check_cb)(const struct net_device *dev);
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev);
|
|
|
|
const struct net_device *dev;
|
|
|
|
struct net_device *lower_dev;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int switchdev_lower_dev_walk(struct net_device *lower_dev,
|
|
|
|
struct netdev_nested_priv *priv)
|
|
|
|
{
|
|
|
|
struct switchdev_nested_priv *switchdev_priv = priv->data;
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev);
|
|
|
|
bool (*check_cb)(const struct net_device *dev);
|
|
|
|
const struct net_device *dev;
|
|
|
|
|
|
|
|
check_cb = switchdev_priv->check_cb;
|
|
|
|
foreign_dev_check_cb = switchdev_priv->foreign_dev_check_cb;
|
|
|
|
dev = switchdev_priv->dev;
|
|
|
|
|
|
|
|
if (check_cb(lower_dev) && !foreign_dev_check_cb(lower_dev, dev)) {
|
|
|
|
switchdev_priv->lower_dev = lower_dev;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct net_device *
|
2022-02-16 01:02:15 +08:00
|
|
|
switchdev_lower_dev_find_rcu(struct net_device *dev,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev))
|
net: switchdev: fix FDB entries towards foreign ports not getting propagated to us
The newly introduced switchdev_handle_fdb_{add,del}_to_device helpers
solved a problem but introduced another one. They have a severe design
bug: they do not propagate FDB events on foreign interfaces to us, i.e.
this use case:
br0
/ \
/ \
/ \
/ \
swp0 eno0
(switchdev) (foreign)
when an address is learned on eno0, what is supposed to happen is that
this event should also be propagated towards swp0. Somehow I managed to
convince myself that this did work correctly, but obviously it does not.
The trouble with foreign interfaces is that we must reach a switchdev
net_device pointer through a foreign net_device that has no direct
upper/lower relationship with it. So we need to do exploratory searching
through the lower interfaces of the foreign net_device's bridge upper
(to reach swp0 from eno0, we must check its upper, br0, for lower
interfaces that pass the check_cb and foreign_dev_check_cb). This is
something that the previous code did not do, it just assumed that "dev"
will become a switchdev interface at some point, somehow, probably by
magic.
With this patch, assisted address learning on the CPU port works again
in DSA:
ip link add br0 type bridge
ip link set swp0 master br0
ip link set eno0 master br0
ip link set br0 up
[ 46.708929] mscc_felix 0000:00:00.5 swp0: Adding FDB entry towards eno0, addr 00:04:9f:05:f4:ab vid 0 as host address
Fixes: 8ca07176ab00 ("net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE")
Reported-by: Eric Woudstra <ericwouds@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-22 07:05:55 +08:00
|
|
|
{
|
|
|
|
struct switchdev_nested_priv switchdev_priv = {
|
|
|
|
.check_cb = check_cb,
|
|
|
|
.foreign_dev_check_cb = foreign_dev_check_cb,
|
|
|
|
.dev = dev,
|
|
|
|
.lower_dev = NULL,
|
|
|
|
};
|
|
|
|
struct netdev_nested_priv priv = {
|
|
|
|
.data = &switchdev_priv,
|
|
|
|
};
|
|
|
|
|
|
|
|
netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv);
|
|
|
|
|
|
|
|
return switchdev_priv.lower_dev;
|
|
|
|
}
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
static struct net_device *
|
|
|
|
switchdev_lower_dev_find(struct net_device *dev,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev))
|
|
|
|
{
|
|
|
|
struct switchdev_nested_priv switchdev_priv = {
|
|
|
|
.check_cb = check_cb,
|
|
|
|
.foreign_dev_check_cb = foreign_dev_check_cb,
|
|
|
|
.dev = dev,
|
|
|
|
.lower_dev = NULL,
|
|
|
|
};
|
|
|
|
struct netdev_nested_priv priv = {
|
|
|
|
.data = &switchdev_priv,
|
|
|
|
};
|
|
|
|
|
|
|
|
netdev_walk_all_lower_dev(dev, switchdev_lower_dev_walk, &priv);
|
|
|
|
|
|
|
|
return switchdev_priv.lower_dev;
|
|
|
|
}
|
|
|
|
|
2021-10-26 22:27:43 +08:00
|
|
|
static int __switchdev_handle_fdb_event_to_device(struct net_device *dev,
|
|
|
|
struct net_device *orig_dev, unsigned long event,
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
2021-10-26 22:27:43 +08:00
|
|
|
int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev,
|
|
|
|
unsigned long event, const void *ctx,
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info))
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
{
|
|
|
|
const struct switchdev_notifier_info *info = &fdb_info->info;
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
struct net_device *br, *lower_dev, *switchdev;
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
struct list_head *iter;
|
|
|
|
int err = -EOPNOTSUPP;
|
|
|
|
|
net: switchdev: fix FDB entries towards foreign ports not getting propagated to us
The newly introduced switchdev_handle_fdb_{add,del}_to_device helpers
solved a problem but introduced another one. They have a severe design
bug: they do not propagate FDB events on foreign interfaces to us, i.e.
this use case:
br0
/ \
/ \
/ \
/ \
swp0 eno0
(switchdev) (foreign)
when an address is learned on eno0, what is supposed to happen is that
this event should also be propagated towards swp0. Somehow I managed to
convince myself that this did work correctly, but obviously it does not.
The trouble with foreign interfaces is that we must reach a switchdev
net_device pointer through a foreign net_device that has no direct
upper/lower relationship with it. So we need to do exploratory searching
through the lower interfaces of the foreign net_device's bridge upper
(to reach swp0 from eno0, we must check its upper, br0, for lower
interfaces that pass the check_cb and foreign_dev_check_cb). This is
something that the previous code did not do, it just assumed that "dev"
will become a switchdev interface at some point, somehow, probably by
magic.
With this patch, assisted address learning on the CPU port works again
in DSA:
ip link add br0 type bridge
ip link set swp0 master br0
ip link set eno0 master br0
ip link set br0 up
[ 46.708929] mscc_felix 0000:00:00.5 swp0: Adding FDB entry towards eno0, addr 00:04:9f:05:f4:ab vid 0 as host address
Fixes: 8ca07176ab00 ("net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE")
Reported-by: Eric Woudstra <ericwouds@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-22 07:05:55 +08:00
|
|
|
if (check_cb(dev))
|
2021-10-26 22:27:43 +08:00
|
|
|
return mod_cb(dev, orig_dev, event, info->ctx, fdb_info);
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
|
|
|
|
/* Recurse through lower interfaces in case the FDB entry is pointing
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
* towards a bridge or a LAG device.
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
*/
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
netdev_for_each_lower_dev(dev, lower_dev, iter) {
|
|
|
|
/* Do not propagate FDB entries across bridges */
|
|
|
|
if (netif_is_bridge_master(lower_dev))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Bridge ports might be either us, or LAG interfaces
|
|
|
|
* that we offload.
|
|
|
|
*/
|
|
|
|
if (!check_cb(lower_dev) &&
|
|
|
|
!switchdev_lower_dev_find_rcu(lower_dev, check_cb,
|
|
|
|
foreign_dev_check_cb))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
err = __switchdev_handle_fdb_event_to_device(lower_dev, orig_dev,
|
|
|
|
event, fdb_info, check_cb,
|
|
|
|
foreign_dev_check_cb,
|
|
|
|
mod_cb);
|
|
|
|
if (err && err != -EOPNOTSUPP)
|
|
|
|
return err;
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
}
|
|
|
|
|
net: switchdev: fix FDB entries towards foreign ports not getting propagated to us
The newly introduced switchdev_handle_fdb_{add,del}_to_device helpers
solved a problem but introduced another one. They have a severe design
bug: they do not propagate FDB events on foreign interfaces to us, i.e.
this use case:
br0
/ \
/ \
/ \
/ \
swp0 eno0
(switchdev) (foreign)
when an address is learned on eno0, what is supposed to happen is that
this event should also be propagated towards swp0. Somehow I managed to
convince myself that this did work correctly, but obviously it does not.
The trouble with foreign interfaces is that we must reach a switchdev
net_device pointer through a foreign net_device that has no direct
upper/lower relationship with it. So we need to do exploratory searching
through the lower interfaces of the foreign net_device's bridge upper
(to reach swp0 from eno0, we must check its upper, br0, for lower
interfaces that pass the check_cb and foreign_dev_check_cb). This is
something that the previous code did not do, it just assumed that "dev"
will become a switchdev interface at some point, somehow, probably by
magic.
With this patch, assisted address learning on the CPU port works again
in DSA:
ip link add br0 type bridge
ip link set swp0 master br0
ip link set eno0 master br0
ip link set br0 up
[ 46.708929] mscc_felix 0000:00:00.5 swp0: Adding FDB entry towards eno0, addr 00:04:9f:05:f4:ab vid 0 as host address
Fixes: 8ca07176ab00 ("net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE")
Reported-by: Eric Woudstra <ericwouds@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-22 07:05:55 +08:00
|
|
|
/* Event is neither on a bridge nor a LAG. Check whether it is on an
|
|
|
|
* interface that is in a bridge with us.
|
|
|
|
*/
|
|
|
|
br = netdev_master_upper_dev_get_rcu(dev);
|
|
|
|
if (!br || !netif_is_bridge_master(br))
|
|
|
|
return 0;
|
|
|
|
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
switchdev = switchdev_lower_dev_find_rcu(br, check_cb, foreign_dev_check_cb);
|
|
|
|
if (!switchdev)
|
net: switchdev: fix FDB entries towards foreign ports not getting propagated to us
The newly introduced switchdev_handle_fdb_{add,del}_to_device helpers
solved a problem but introduced another one. They have a severe design
bug: they do not propagate FDB events on foreign interfaces to us, i.e.
this use case:
br0
/ \
/ \
/ \
/ \
swp0 eno0
(switchdev) (foreign)
when an address is learned on eno0, what is supposed to happen is that
this event should also be propagated towards swp0. Somehow I managed to
convince myself that this did work correctly, but obviously it does not.
The trouble with foreign interfaces is that we must reach a switchdev
net_device pointer through a foreign net_device that has no direct
upper/lower relationship with it. So we need to do exploratory searching
through the lower interfaces of the foreign net_device's bridge upper
(to reach swp0 from eno0, we must check its upper, br0, for lower
interfaces that pass the check_cb and foreign_dev_check_cb). This is
something that the previous code did not do, it just assumed that "dev"
will become a switchdev interface at some point, somehow, probably by
magic.
With this patch, assisted address learning on the CPU port works again
in DSA:
ip link add br0 type bridge
ip link set swp0 master br0
ip link set eno0 master br0
ip link set br0 up
[ 46.708929] mscc_felix 0000:00:00.5 swp0: Adding FDB entry towards eno0, addr 00:04:9f:05:f4:ab vid 0 as host address
Fixes: 8ca07176ab00 ("net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE")
Reported-by: Eric Woudstra <ericwouds@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-22 07:05:55 +08:00
|
|
|
return 0;
|
|
|
|
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
if (!foreign_dev_check_cb(switchdev, dev))
|
|
|
|
return err;
|
|
|
|
|
2021-10-26 22:27:43 +08:00
|
|
|
return __switchdev_handle_fdb_event_to_device(br, orig_dev, event, fdb_info,
|
|
|
|
check_cb, foreign_dev_check_cb,
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
mod_cb);
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
}
|
|
|
|
|
2021-10-26 22:27:43 +08:00
|
|
|
int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event,
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
2021-10-26 22:27:43 +08:00
|
|
|
int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev,
|
|
|
|
unsigned long event, const void *ctx,
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
const struct switchdev_notifier_fdb_info *fdb_info))
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
2021-10-26 22:27:43 +08:00
|
|
|
err = __switchdev_handle_fdb_event_to_device(dev, dev, event, fdb_info,
|
|
|
|
check_cb, foreign_dev_check_cb,
|
net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device
When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.
No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:
https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/
however this approach is slightly convoluted because:
- the switchdev driver gets a "dev" that isn't its own net device, but
rather the LAG net device. It must call switchdev_lower_dev_find(dev)
in order to get a handle of any of its own net devices (the ones that
pass check_cb).
- in order for FDB entries on LAG ports to be correctly refcounted per
the number of switchdev ports beneath that LAG, we haven't escaped the
need to iterate through the LAG's lower interfaces. Except that is now
the responsibility of the switchdev driver, because the replication
helper just stopped half-way.
So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.
The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.
Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.
Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-02-23 22:00:50 +08:00
|
|
|
mod_cb);
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
if (err == -EOPNOTSUPP)
|
|
|
|
err = 0;
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
2021-10-26 22:27:43 +08:00
|
|
|
EXPORT_SYMBOL_GPL(switchdev_handle_fdb_event_to_device);
|
net: switchdev: introduce a fanout helper for SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
Currently DSA has an issue with FDB entries pointing towards the bridge
in the presence of br_fdb_replay() being called at port join and leave
time.
In particular, each bridge port will ask for a replay for the FDB
entries pointing towards the bridge when it joins, and for another
replay when it leaves.
This means that for example, a bridge with 4 switch ports will notify
DSA 4 times of the bridge MAC address.
But if the MAC address of the bridge changes during the normal runtime
of the system, the bridge notifies switchdev [ once ] of the deletion of
the old MAC address as a local FDB towards the bridge, and of the
insertion [ again once ] of the new MAC address as a local FDB.
This is a problem, because DSA keeps the old MAC address as a host FDB
entry with refcount 4 (4 ports asked for it using br_fdb_replay). So the
old MAC address will not be deleted. Additionally, the new MAC address
will only be installed with refcount 1, and when the first switch port
leaves the bridge (leaving 3 others as still members), it will delete
with it the new MAC address of the bridge from the local FDB entries
kept by DSA (because the br_fdb_replay call on deletion will bring the
entry's refcount from 1 to 0).
So the problem, really, is that the number of br_fdb_replay() calls is
not matched with the refcount that a host FDB is offloaded to DSA during
normal runtime.
An elegant way to solve the problem would be to make the switchdev
notification emitted by br_fdb_change_mac_address() result in a host FDB
kept by DSA which has a refcount exactly equal to the number of ports
under that bridge. Then, no matter how many DSA ports join or leave that
bridge, the host FDB entry will always be deleted when there are exactly
zero remaining DSA switch ports members of the bridge.
To implement the proposed solution, we remember that the switchdev
objects and port attributes have some helpers provided by switchdev,
which can be optionally called by drivers:
switchdev_handle_port_obj_{add,del} and switchdev_handle_port_attr_set.
These helpers:
- fan out a switchdev object/attribute emitted for the bridge towards
all the lower interfaces that pass the check_cb().
- fan out a switchdev object/attribute emitted for a bridge port that is
a LAG towards all the lower interfaces that pass the check_cb().
In other words, this is the model we need for the FDB events too:
something that will keep an FDB entry emitted towards a physical port as
it is, but translate an FDB entry emitted towards the bridge into N FDB
entries, one per physical port.
Of course, there are many differences between fanning out a switchdev
object (VLAN) on 3 lower interfaces of a LAG and fanning out an FDB
entry on 3 lower interfaces of a LAG. Intuitively, an FDB entry towards
a LAG should be treated specially, because FDB entries are unicast, we
can't just install the same address towards 3 destinations. It is
imaginable that drivers might want to treat this case specifically, so
create some methods for this case and do not recurse into the LAG lower
ports, just the bridge ports.
DSA also listens for FDB entries on "foreign" interfaces, aka interfaces
bridged with us which are not part of our hardware domain: think an
Ethernet switch bridged with a Wi-Fi AP. For those addresses, DSA
installs host FDB entries. However, there we have the same problem
(those host FDB entries are installed with a refcount of only 1) and an
even bigger one which we did not have with FDB entries towards the
bridge:
br_fdb_replay() is currently not called for FDB entries on foreign
interfaces, just for the physical port and for the bridge itself.
So when DSA sniffs an address learned by the software bridge towards a
foreign interface like an e1000 port, and then that e1000 leaves the
bridge, DSA remains with the dangling host FDB address. That will be
fixed separately by replaying all FDB entries and not just the ones
towards the port and the bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-19 21:51:39 +08:00
|
|
|
|
2018-11-23 07:29:44 +08:00
|
|
|
static int __switchdev_handle_port_obj_add(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*add_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj,
|
2018-12-13 01:02:56 +08:00
|
|
|
struct netlink_ext_ack *extack))
|
2018-11-23 07:29:44 +08:00
|
|
|
{
|
2021-06-27 19:54:24 +08:00
|
|
|
struct switchdev_notifier_info *info = &port_obj_info->info;
|
2022-02-21 20:01:30 +08:00
|
|
|
struct net_device *br, *lower_dev, *switchdev;
|
2018-12-13 01:02:56 +08:00
|
|
|
struct netlink_ext_ack *extack;
|
2018-11-23 07:29:44 +08:00
|
|
|
struct list_head *iter;
|
|
|
|
int err = -EOPNOTSUPP;
|
|
|
|
|
2021-06-27 19:54:24 +08:00
|
|
|
extack = switchdev_notifier_info_to_extack(info);
|
2018-12-13 01:02:56 +08:00
|
|
|
|
2018-11-23 07:29:44 +08:00
|
|
|
if (check_cb(dev)) {
|
2021-06-27 19:54:24 +08:00
|
|
|
err = add_cb(dev, info->ctx, port_obj_info->obj, extack);
|
net: switchdev: don't set port_obj_info->handled true when -EOPNOTSUPP
It's not true that switchdev_port_obj_notify() only inspects the
->handled field of "struct switchdev_notifier_port_obj_info" if
call_switchdev_blocking_notifiers() returns 0 - there's a WARN_ON()
triggering for a non-zero return combined with ->handled not being
true. But the real problem here is that -EOPNOTSUPP is not being
properly handled.
The wrapper functions switchdev_handle_port_obj_add() et al change a
return value of -EOPNOTSUPP to 0, and the treatment of ->handled in
switchdev_port_obj_notify() seems to be designed to change that back
to -EOPNOTSUPP in case nobody actually acted on the notifier (i.e.,
everybody returned -EOPNOTSUPP).
Currently, as soon as some device down the stack passes the check_cb()
check, ->handled gets set to true, which means that
switchdev_port_obj_notify() cannot actually ever return -EOPNOTSUPP.
This, for example, means that the detection of hardware offload
support in the MRP code is broken: switchdev_port_obj_add() used by
br_mrp_switchdev_send_ring_test() always returns 0, so since the MRP
code thinks the generation of MRP test frames has been offloaded, no
such frames are actually put on the wire. Similarly,
br_mrp_switchdev_set_ring_role() also always returns 0, causing
mrp->ring_role_offloaded to be set to 1.
To fix this, continue to set ->handled true if any callback returns
success or any error distinct from -EOPNOTSUPP. But if all the
callbacks return -EOPNOTSUPP, make sure that ->handled stays false, so
the logic in switchdev_port_obj_notify() can propagate that
information.
Fixes: 9a9f26e8f7ea ("bridge: mrp: Connect MRP API with the switchdev API")
Fixes: f30f0601eb93 ("switchdev: Add helpers to aid traversal through lower devices")
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Link: https://lore.kernel.org/r/20210125124116.102928-1-rasmus.villemoes@prevas.dk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-25 20:41:16 +08:00
|
|
|
if (err != -EOPNOTSUPP)
|
|
|
|
port_obj_info->handled = true;
|
|
|
|
return err;
|
2018-11-23 07:29:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Switch ports might be stacked under e.g. a LAG. Ignore the
|
|
|
|
* unsupported devices, another driver might be able to handle them. But
|
|
|
|
* propagate to the callers any hard errors.
|
|
|
|
*
|
|
|
|
* If the driver does its own bookkeeping of stacked ports, it's not
|
|
|
|
* necessary to go through this helper.
|
|
|
|
*/
|
|
|
|
netdev_for_each_lower_dev(dev, lower_dev, iter) {
|
2020-02-27 01:14:21 +08:00
|
|
|
if (netif_is_bridge_master(lower_dev))
|
|
|
|
continue;
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* When searching for switchdev interfaces that are neighbors
|
|
|
|
* of foreign ones, and @dev is a bridge, do not recurse on the
|
|
|
|
* foreign interface again, it was already visited.
|
|
|
|
*/
|
|
|
|
if (foreign_dev_check_cb && !check_cb(lower_dev) &&
|
|
|
|
!switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb))
|
|
|
|
continue;
|
|
|
|
|
2018-11-23 07:29:44 +08:00
|
|
|
err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info,
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
check_cb, foreign_dev_check_cb,
|
|
|
|
add_cb);
|
2018-11-23 07:29:44 +08:00
|
|
|
if (err && err != -EOPNOTSUPP)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* Event is neither on a bridge nor a LAG. Check whether it is on an
|
|
|
|
* interface that is in a bridge with us.
|
|
|
|
*/
|
|
|
|
if (!foreign_dev_check_cb)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
br = netdev_master_upper_dev_get(dev);
|
|
|
|
if (!br || !netif_is_bridge_master(br))
|
|
|
|
return err;
|
|
|
|
|
2022-02-21 20:01:30 +08:00
|
|
|
switchdev = switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb);
|
|
|
|
if (!switchdev)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!foreign_dev_check_cb(switchdev, dev))
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
return err;
|
|
|
|
|
|
|
|
return __switchdev_handle_port_obj_add(br, port_obj_info, check_cb,
|
|
|
|
foreign_dev_check_cb, add_cb);
|
2018-11-23 07:29:44 +08:00
|
|
|
}
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* Pass through a port object addition, if @dev passes @check_cb, or replicate
|
|
|
|
* it towards all lower interfaces of @dev that pass @check_cb, if @dev is a
|
|
|
|
* bridge or a LAG.
|
|
|
|
*/
|
2018-11-23 07:29:44 +08:00
|
|
|
int switchdev_handle_port_obj_add(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*add_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj,
|
2018-12-13 01:02:56 +08:00
|
|
|
struct netlink_ext_ack *extack))
|
2018-11-23 07:29:44 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb,
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
NULL, add_cb);
|
2018-11-23 07:29:44 +08:00
|
|
|
if (err == -EOPNOTSUPP)
|
|
|
|
err = 0;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add);
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* Same as switchdev_handle_port_obj_add(), except if object is notified on a
|
|
|
|
* @dev that passes @foreign_dev_check_cb, it is replicated towards all devices
|
|
|
|
* that pass @check_cb and are in the same bridge as @dev.
|
|
|
|
*/
|
|
|
|
int switchdev_handle_port_obj_add_foreign(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
|
|
|
int (*add_cb)(struct net_device *dev, const void *ctx,
|
|
|
|
const struct switchdev_obj *obj,
|
|
|
|
struct netlink_ext_ack *extack))
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb,
|
|
|
|
foreign_dev_check_cb, add_cb);
|
|
|
|
if (err == -EOPNOTSUPP)
|
|
|
|
err = 0;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add_foreign);
|
|
|
|
|
2018-11-23 07:29:44 +08:00
|
|
|
static int __switchdev_handle_port_obj_del(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*del_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj))
|
|
|
|
{
|
2021-06-27 19:54:24 +08:00
|
|
|
struct switchdev_notifier_info *info = &port_obj_info->info;
|
2022-02-21 20:01:30 +08:00
|
|
|
struct net_device *br, *lower_dev, *switchdev;
|
2018-11-23 07:29:44 +08:00
|
|
|
struct list_head *iter;
|
|
|
|
int err = -EOPNOTSUPP;
|
|
|
|
|
|
|
|
if (check_cb(dev)) {
|
2021-06-27 19:54:24 +08:00
|
|
|
err = del_cb(dev, info->ctx, port_obj_info->obj);
|
net: switchdev: don't set port_obj_info->handled true when -EOPNOTSUPP
It's not true that switchdev_port_obj_notify() only inspects the
->handled field of "struct switchdev_notifier_port_obj_info" if
call_switchdev_blocking_notifiers() returns 0 - there's a WARN_ON()
triggering for a non-zero return combined with ->handled not being
true. But the real problem here is that -EOPNOTSUPP is not being
properly handled.
The wrapper functions switchdev_handle_port_obj_add() et al change a
return value of -EOPNOTSUPP to 0, and the treatment of ->handled in
switchdev_port_obj_notify() seems to be designed to change that back
to -EOPNOTSUPP in case nobody actually acted on the notifier (i.e.,
everybody returned -EOPNOTSUPP).
Currently, as soon as some device down the stack passes the check_cb()
check, ->handled gets set to true, which means that
switchdev_port_obj_notify() cannot actually ever return -EOPNOTSUPP.
This, for example, means that the detection of hardware offload
support in the MRP code is broken: switchdev_port_obj_add() used by
br_mrp_switchdev_send_ring_test() always returns 0, so since the MRP
code thinks the generation of MRP test frames has been offloaded, no
such frames are actually put on the wire. Similarly,
br_mrp_switchdev_set_ring_role() also always returns 0, causing
mrp->ring_role_offloaded to be set to 1.
To fix this, continue to set ->handled true if any callback returns
success or any error distinct from -EOPNOTSUPP. But if all the
callbacks return -EOPNOTSUPP, make sure that ->handled stays false, so
the logic in switchdev_port_obj_notify() can propagate that
information.
Fixes: 9a9f26e8f7ea ("bridge: mrp: Connect MRP API with the switchdev API")
Fixes: f30f0601eb93 ("switchdev: Add helpers to aid traversal through lower devices")
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Link: https://lore.kernel.org/r/20210125124116.102928-1-rasmus.villemoes@prevas.dk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-25 20:41:16 +08:00
|
|
|
if (err != -EOPNOTSUPP)
|
|
|
|
port_obj_info->handled = true;
|
|
|
|
return err;
|
2018-11-23 07:29:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Switch ports might be stacked under e.g. a LAG. Ignore the
|
|
|
|
* unsupported devices, another driver might be able to handle them. But
|
|
|
|
* propagate to the callers any hard errors.
|
|
|
|
*
|
|
|
|
* If the driver does its own bookkeeping of stacked ports, it's not
|
|
|
|
* necessary to go through this helper.
|
|
|
|
*/
|
|
|
|
netdev_for_each_lower_dev(dev, lower_dev, iter) {
|
2020-02-27 01:14:21 +08:00
|
|
|
if (netif_is_bridge_master(lower_dev))
|
|
|
|
continue;
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* When searching for switchdev interfaces that are neighbors
|
|
|
|
* of foreign ones, and @dev is a bridge, do not recurse on the
|
|
|
|
* foreign interface again, it was already visited.
|
|
|
|
*/
|
|
|
|
if (foreign_dev_check_cb && !check_cb(lower_dev) &&
|
|
|
|
!switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb))
|
|
|
|
continue;
|
|
|
|
|
2018-11-23 07:29:44 +08:00
|
|
|
err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info,
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
check_cb, foreign_dev_check_cb,
|
|
|
|
del_cb);
|
2018-11-23 07:29:44 +08:00
|
|
|
if (err && err != -EOPNOTSUPP)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* Event is neither on a bridge nor a LAG. Check whether it is on an
|
|
|
|
* interface that is in a bridge with us.
|
|
|
|
*/
|
|
|
|
if (!foreign_dev_check_cb)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
br = netdev_master_upper_dev_get(dev);
|
|
|
|
if (!br || !netif_is_bridge_master(br))
|
|
|
|
return err;
|
|
|
|
|
2022-02-21 20:01:30 +08:00
|
|
|
switchdev = switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb);
|
|
|
|
if (!switchdev)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!foreign_dev_check_cb(switchdev, dev))
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
return err;
|
|
|
|
|
|
|
|
return __switchdev_handle_port_obj_del(br, port_obj_info, check_cb,
|
|
|
|
foreign_dev_check_cb, del_cb);
|
2018-11-23 07:29:44 +08:00
|
|
|
}
|
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* Pass through a port object deletion, if @dev passes @check_cb, or replicate
|
|
|
|
* it towards all lower interfaces of @dev that pass @check_cb, if @dev is a
|
|
|
|
* bridge or a LAG.
|
|
|
|
*/
|
2018-11-23 07:29:44 +08:00
|
|
|
int switchdev_handle_port_obj_del(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*del_cb)(struct net_device *dev, const void *ctx,
|
2018-11-23 07:29:44 +08:00
|
|
|
const struct switchdev_obj *obj))
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb,
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
NULL, del_cb);
|
2018-11-23 07:29:44 +08:00
|
|
|
if (err == -EOPNOTSUPP)
|
|
|
|
err = 0;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del);
|
2019-02-28 03:44:25 +08:00
|
|
|
|
net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces
The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.
However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").
One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.
To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.
The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.
It looks like this:
br0
/ | \
/ | \
/ | \
swp0 swp1 eth0
1. __switchdev_handle_port_obj_add(eth0)
-> check_cb(eth0) returns false
-> eth0 has no lower interfaces
-> eth0's bridge is br0
-> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
finds br0
2. __switchdev_handle_port_obj_add(br0)
-> check_cb(br0) returns false
-> netdev_for_each_lower_dev
-> check_cb(swp0) returns true, so we don't skip this interface
3. __switchdev_handle_port_obj_add(swp0)
-> check_cb(swp0) returns true, so we call add_cb(swp0)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(swp1) returns true, so we don't skip this interface
4. __switchdev_handle_port_obj_add(swp1)
-> check_cb(swp1) returns true, so we call add_cb(swp1)
(back to netdev_for_each_lower_dev from 2)
-> check_cb(eth0) returns false, so we skip this interface to
avoid infinite recursion
Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).
The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-16 01:02:16 +08:00
|
|
|
/* Same as switchdev_handle_port_obj_del(), except if object is notified on a
|
|
|
|
* @dev that passes @foreign_dev_check_cb, it is replicated towards all devices
|
|
|
|
* that pass @check_cb and are in the same bridge as @dev.
|
|
|
|
*/
|
|
|
|
int switchdev_handle_port_obj_del_foreign(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_obj_info *port_obj_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
|
|
|
bool (*foreign_dev_check_cb)(const struct net_device *dev,
|
|
|
|
const struct net_device *foreign_dev),
|
|
|
|
int (*del_cb)(struct net_device *dev, const void *ctx,
|
|
|
|
const struct switchdev_obj *obj))
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb,
|
|
|
|
foreign_dev_check_cb, del_cb);
|
|
|
|
if (err == -EOPNOTSUPP)
|
|
|
|
err = 0;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del_foreign);
|
|
|
|
|
2019-02-28 03:44:25 +08:00
|
|
|
static int __switchdev_handle_port_attr_set(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_attr_info *port_attr_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*set_cb)(struct net_device *dev, const void *ctx,
|
2021-02-12 23:15:51 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack))
|
2019-02-28 03:44:25 +08:00
|
|
|
{
|
2021-06-27 19:54:24 +08:00
|
|
|
struct switchdev_notifier_info *info = &port_attr_info->info;
|
2021-02-12 23:15:51 +08:00
|
|
|
struct netlink_ext_ack *extack;
|
2019-02-28 03:44:25 +08:00
|
|
|
struct net_device *lower_dev;
|
|
|
|
struct list_head *iter;
|
|
|
|
int err = -EOPNOTSUPP;
|
|
|
|
|
2021-06-27 19:54:24 +08:00
|
|
|
extack = switchdev_notifier_info_to_extack(info);
|
2021-02-12 23:15:51 +08:00
|
|
|
|
2019-02-28 03:44:25 +08:00
|
|
|
if (check_cb(dev)) {
|
2021-06-27 19:54:24 +08:00
|
|
|
err = set_cb(dev, info->ctx, port_attr_info->attr, extack);
|
net: switchdev: don't set port_obj_info->handled true when -EOPNOTSUPP
It's not true that switchdev_port_obj_notify() only inspects the
->handled field of "struct switchdev_notifier_port_obj_info" if
call_switchdev_blocking_notifiers() returns 0 - there's a WARN_ON()
triggering for a non-zero return combined with ->handled not being
true. But the real problem here is that -EOPNOTSUPP is not being
properly handled.
The wrapper functions switchdev_handle_port_obj_add() et al change a
return value of -EOPNOTSUPP to 0, and the treatment of ->handled in
switchdev_port_obj_notify() seems to be designed to change that back
to -EOPNOTSUPP in case nobody actually acted on the notifier (i.e.,
everybody returned -EOPNOTSUPP).
Currently, as soon as some device down the stack passes the check_cb()
check, ->handled gets set to true, which means that
switchdev_port_obj_notify() cannot actually ever return -EOPNOTSUPP.
This, for example, means that the detection of hardware offload
support in the MRP code is broken: switchdev_port_obj_add() used by
br_mrp_switchdev_send_ring_test() always returns 0, so since the MRP
code thinks the generation of MRP test frames has been offloaded, no
such frames are actually put on the wire. Similarly,
br_mrp_switchdev_set_ring_role() also always returns 0, causing
mrp->ring_role_offloaded to be set to 1.
To fix this, continue to set ->handled true if any callback returns
success or any error distinct from -EOPNOTSUPP. But if all the
callbacks return -EOPNOTSUPP, make sure that ->handled stays false, so
the logic in switchdev_port_obj_notify() can propagate that
information.
Fixes: 9a9f26e8f7ea ("bridge: mrp: Connect MRP API with the switchdev API")
Fixes: f30f0601eb93 ("switchdev: Add helpers to aid traversal through lower devices")
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Link: https://lore.kernel.org/r/20210125124116.102928-1-rasmus.villemoes@prevas.dk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-25 20:41:16 +08:00
|
|
|
if (err != -EOPNOTSUPP)
|
|
|
|
port_attr_info->handled = true;
|
|
|
|
return err;
|
2019-02-28 03:44:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Switch ports might be stacked under e.g. a LAG. Ignore the
|
|
|
|
* unsupported devices, another driver might be able to handle them. But
|
|
|
|
* propagate to the callers any hard errors.
|
|
|
|
*
|
|
|
|
* If the driver does its own bookkeeping of stacked ports, it's not
|
|
|
|
* necessary to go through this helper.
|
|
|
|
*/
|
|
|
|
netdev_for_each_lower_dev(dev, lower_dev, iter) {
|
2020-02-27 01:14:21 +08:00
|
|
|
if (netif_is_bridge_master(lower_dev))
|
|
|
|
continue;
|
|
|
|
|
2019-02-28 03:44:25 +08:00
|
|
|
err = __switchdev_handle_port_attr_set(lower_dev, port_attr_info,
|
|
|
|
check_cb, set_cb);
|
|
|
|
if (err && err != -EOPNOTSUPP)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
int switchdev_handle_port_attr_set(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_port_attr_info *port_attr_info,
|
|
|
|
bool (*check_cb)(const struct net_device *dev),
|
2021-06-27 19:54:24 +08:00
|
|
|
int (*set_cb)(struct net_device *dev, const void *ctx,
|
2021-02-12 23:15:51 +08:00
|
|
|
const struct switchdev_attr *attr,
|
|
|
|
struct netlink_ext_ack *extack))
|
2019-02-28 03:44:25 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = __switchdev_handle_port_attr_set(dev, port_attr_info, check_cb,
|
|
|
|
set_cb);
|
|
|
|
if (err == -EOPNOTSUPP)
|
|
|
|
err = 0;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_handle_port_attr_set);
|
net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge
With the introduction of explicit offloading API in switchdev in commit
2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge
ports are offloaded"), we started having Ethernet switch drivers calling
directly into a function exported by net/bridge/br_switchdev.c, which is
a function exported by the bridge driver.
This means that drivers that did not have an explicit dependency on the
bridge before, like cpsw and am65-cpsw, now do - otherwise it is not
possible to call a symbol exported by a driver that can be built as
module unless you are a module too.
There was an attempt to solve the dependency issue in the form of commit
b0e81817629a ("net: build all switchdev drivers as modules when the
bridge is a module"). Grygorii Strashko, however, says about it:
| In my opinion, the problem is a bit bigger here than just fixing the
| build :(
|
| In case, of ^cpsw the switchdev mode is kinda optional and in many
| cases (especially for testing purposes, NFS) the multi-mac mode is
| still preferable mode.
|
| There were no such tight dependency between switchdev drivers and
| bridge core before and switchdev serviced as independent, notification
| based layer between them, so ^cpsw still can be "Y" and bridge can be
| "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE
| will need to be set as "Y", or we will have to update drivers to
| support build with BRIDGE=n and maintain separate builds for
| networking vs non-networking testing. But is this enough? Wouldn't
| it cause 'chain reaction' required to add more and more "Y" options
| (like CONFIG_VLAN_8021Q)?
|
| PS. Just to be sure we on the same page - ARM builds will be forced
| (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our
| automation testing will just fail with omap2plus_defconfig.
In the light of this, it would be desirable for some configurations to
avoid dependencies between switchdev drivers and the bridge, and have
the switchdev mode as completely optional within the driver.
Arnd Bergmann also tried to write a patch which better expressed the
build time dependency for Ethernet switch drivers where the switchdev
support is optional, like cpsw/am65-cpsw, and this made the drivers
follow the bridge (compile as module if the bridge is a module) only if
the optional switchdev support in the driver was enabled in the first
place:
https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/
but this still did not solve the fact that cpsw and am65-cpsw now must
be built as modules when the bridge is a module - it just expressed
correctly that optional dependency. But the new behavior is an apparent
regression from Grygorii's perspective.
So to support the use case where the Ethernet driver is built-in,
NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we
need a framework that can handle the possible absence of the bridge from
the running system, i.e. runtime bloatware as opposed to build-time
bloatware.
Luckily we already have this framework, since switchdev has been using
it extensively. Events from the bridge side are transmitted to the
driver side using notifier chains - this was originally done so that
unrelated drivers could snoop for events emitted by the bridge towards
ports that are implemented by other drivers (think of a switch driver
with LAG offload that listens for switchdev events on a bonding/team
interface that it offloads).
There are also events which are transmitted from the driver side to the
bridge side, which again are modeled using notifiers.
SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with
notifying the bridge that a MAC address has been dynamically learned.
So there is a precedent we can use for modeling the new framework.
The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work
that the bridge needs to do when a port becomes offloaded is blocking in
its nature: replay VLANs, MDBs etc. The calling context is indeed
blocking (we are under rtnl_mutex), but the existing switchdev
notification chain that the bridge is subscribed to is only the atomic
one. So we need to subscribe the bridge to the blocking switchdev
notification chain too.
This patch:
- keeps the driver-side perception of the switchdev_bridge_port_{,un}offload
unchanged
- moves the implementation of switchdev_bridge_port_{,un}offload from
the bridge module into the switchdev module.
- makes everybody that is subscribed to the switchdev blocking notifier
chain "hear" offload & unoffload events
- makes the bridge driver subscribe and handle those events
- moves the bridge driver's handling of those events into 2 new
functions called br_switchdev_port_{,un}offload. These functions
contain in fact the core of the logic that was previously in
switchdev_bridge_port_{,un}offload, just that now we go through an
extra indirection layer to reach them.
Unlike all the other switchdev notification structures, the structure
used to carry the bridge port information, struct
switchdev_notifier_brport_info, does not contain a "bool handled".
This is because in the current usage pattern, we always know that a
switchdev bridge port offloading event will be handled by the bridge,
because the switchdev_bridge_port_offload() call was initiated by a
NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a
bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event
couldn't have happened.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-04 04:34:08 +08:00
|
|
|
|
|
|
|
int switchdev_bridge_port_offload(struct net_device *brport_dev,
|
|
|
|
struct net_device *dev, const void *ctx,
|
|
|
|
struct notifier_block *atomic_nb,
|
|
|
|
struct notifier_block *blocking_nb,
|
|
|
|
bool tx_fwd_offload,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct switchdev_notifier_brport_info brport_info = {
|
|
|
|
.brport = {
|
|
|
|
.dev = dev,
|
|
|
|
.ctx = ctx,
|
|
|
|
.atomic_nb = atomic_nb,
|
|
|
|
.blocking_nb = blocking_nb,
|
|
|
|
.tx_fwd_offload = tx_fwd_offload,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
int err;
|
|
|
|
|
|
|
|
ASSERT_RTNL();
|
|
|
|
|
|
|
|
err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_OFFLOADED,
|
|
|
|
brport_dev, &brport_info.info,
|
|
|
|
extack);
|
|
|
|
return notifier_to_errno(err);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload);
|
|
|
|
|
|
|
|
void switchdev_bridge_port_unoffload(struct net_device *brport_dev,
|
|
|
|
const void *ctx,
|
|
|
|
struct notifier_block *atomic_nb,
|
|
|
|
struct notifier_block *blocking_nb)
|
|
|
|
{
|
|
|
|
struct switchdev_notifier_brport_info brport_info = {
|
|
|
|
.brport = {
|
|
|
|
.ctx = ctx,
|
|
|
|
.atomic_nb = atomic_nb,
|
|
|
|
.blocking_nb = blocking_nb,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
ASSERT_RTNL();
|
|
|
|
|
|
|
|
call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_UNOFFLOADED,
|
|
|
|
brport_dev, &brport_info.info,
|
|
|
|
NULL);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload);
|
2023-07-19 19:01:17 +08:00
|
|
|
|
|
|
|
int switchdev_bridge_port_replay(struct net_device *brport_dev,
|
|
|
|
struct net_device *dev, const void *ctx,
|
|
|
|
struct notifier_block *atomic_nb,
|
|
|
|
struct notifier_block *blocking_nb,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct switchdev_notifier_brport_info brport_info = {
|
|
|
|
.brport = {
|
|
|
|
.dev = dev,
|
|
|
|
.ctx = ctx,
|
|
|
|
.atomic_nb = atomic_nb,
|
|
|
|
.blocking_nb = blocking_nb,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
int err;
|
|
|
|
|
|
|
|
ASSERT_RTNL();
|
|
|
|
|
|
|
|
err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_REPLAY,
|
|
|
|
brport_dev, &brport_info.info,
|
|
|
|
extack);
|
|
|
|
return notifier_to_errno(err);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(switchdev_bridge_port_replay);
|