Merge branch 'veth-flexible-channel-numbers'

Paolo Abeni says:

====================
veth: more flexible channels number configuration

XDP setups can benefit from multiple veth RX/TX queues. Currently
veth allows setting the number of queues only at creation time, via
the 'numrxqueues' and 'numtxqueues' parameters.

This series introduces support for the ethtool set_channel operation
and allows configuring the queue number via a new module parameter.

The veth default configuration is not changed.

Finally self-tests are updated to check the new features, with both
valid and invalid arguments.

This iteration is a rebase of the most recent RFC. It does not provide
a module parameter to configure the default number of queues, but I
think such a parameter could be worthwhile.

RFC v1 -> RFC v2:
 - report more consistent 'combined' count
 - make set_channel as resilient as possible to errors
 - drop module parameter - but I would still consider it.
 - more self-tests
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2021-07-20 06:11:28 -07:00
commit e4b1dc43ec
2 changed files with 434 additions and 54 deletions

View File

@ -224,12 +224,13 @@ static void veth_get_channels(struct net_device *dev,
{
channels->tx_count = dev->real_num_tx_queues;
channels->rx_count = dev->real_num_rx_queues;
channels->max_tx = dev->real_num_tx_queues;
channels->max_rx = dev->real_num_rx_queues;
channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
channels->max_tx = dev->num_tx_queues;
channels->max_rx = dev->num_rx_queues;
}
static int veth_set_channels(struct net_device *dev,
struct ethtool_channels *ch);
static const struct ethtool_ops veth_ethtool_ops = {
.get_drvinfo = veth_get_drvinfo,
.get_link = ethtool_op_get_link,
@ -239,6 +240,7 @@ static const struct ethtool_ops veth_ethtool_ops = {
.get_link_ksettings = veth_get_link_ksettings,
.get_ts_info = ethtool_op_get_ts_info,
.get_channels = veth_get_channels,
.set_channels = veth_set_channels,
};
/* general routines */
@ -928,12 +930,12 @@ static int veth_poll(struct napi_struct *napi, int budget)
return done;
}
static int __veth_napi_enable(struct net_device *dev)
static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
for (i = 0; i < dev->real_num_rx_queues; i++) {
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
@ -941,7 +943,7 @@ static int __veth_napi_enable(struct net_device *dev)
goto err_xdp_ring;
}
for (i = 0; i < dev->real_num_rx_queues; i++) {
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
napi_enable(&rq->xdp_napi);
@ -949,19 +951,25 @@ static int __veth_napi_enable(struct net_device *dev)
}
return 0;
err_xdp_ring:
for (i--; i >= 0; i--)
for (i--; i >= start; i--)
ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
return err;
}
static void veth_napi_del(struct net_device *dev)
/* Run __veth_napi_enable_range() over every currently active RX queue of
 * @dev, i.e. the index range [0, dev->real_num_rx_queues), and return its
 * result.
 */
static int __veth_napi_enable(struct net_device *dev)
{
	int nr_rxq = dev->real_num_rx_queues;

	return __veth_napi_enable_range(dev, 0, nr_rxq);
}
static void veth_napi_del_range(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
int i;
for (i = 0; i < dev->real_num_rx_queues; i++) {
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
rcu_assign_pointer(priv->rq[i].napi, NULL);
@ -970,7 +978,7 @@ static void veth_napi_del(struct net_device *dev)
}
synchronize_net();
for (i = 0; i < dev->real_num_rx_queues; i++) {
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
rq->rx_notify_masked = false;
@ -978,11 +986,73 @@ static void veth_napi_del(struct net_device *dev)
}
}
/* Run veth_napi_del_range() over every currently active RX queue of @dev,
 * i.e. the index range [0, dev->real_num_rx_queues).
 */
static void veth_napi_del(struct net_device *dev)
{
	int nr_rxq = dev->real_num_rx_queues;

	veth_napi_del_range(dev, 0, nr_rxq);
}
/* Return true when NETIF_F_GRO is set in the wanted_features of @dev. */
static bool veth_gro_requested(const struct net_device *dev)
{
	return (dev->wanted_features & NETIF_F_GRO) != 0;
}
/* Register the XDP rxq info (and its MEM_TYPE_PAGE_SHARED memory model) for
 * the RX queues in [start, end).
 *
 * @dev:             device owning the queues
 * @start:           first queue index to set up (inclusive)
 * @end:             last queue index to set up (exclusive)
 * @napi_already_on: when false, a NAPI instance is added for each queue
 *                   before registering it; when true the NAPI instances are
 *                   left untouched
 *
 * Returns 0 on success or the negative error reported by the failing
 * registration. On failure every queue touched by this call is rolled back:
 * its rxq info is unregistered and, if this call added its NAPI instance,
 * that instance is deleted.
 */
static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
				 bool napi_already_on)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		if (!napi_already_on)
			netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
		if (err < 0)
			goto err_rxq_reg;

		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED,
						 NULL);
		if (err < 0)
			goto err_reg_mem;

		/* Save original mem info as it can be overwritten */
		rq->xdp_mem = rq->xdp_rxq.mem;
	}
	return 0;

err_reg_mem:
	/* queue i got its rxq info registered, only the mem model failed */
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	/* The NAPI instance of the failing queue was added above, but the
	 * unwind loop below starts from the previous queue: delete it here
	 * so that it is not left behind.
	 */
	if (!napi_already_on)
		netif_napi_del(&priv->rq[i].xdp_napi);

	for (i--; i >= start; i--) {
		struct veth_rq *rq = &priv->rq[i];

		xdp_rxq_info_unreg(&rq->xdp_rxq);
		if (!napi_already_on)
			netif_napi_del(&rq->xdp_napi);
	}

	return err;
}
/* Unregister the XDP rxq info of the RX queues in [start, end).
 *
 * @dev:         device owning the queues
 * @start:       first queue index (inclusive)
 * @end:         last queue index (exclusive)
 * @delete_napi: when true, also delete the NAPI instance of each queue
 */
static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
				   bool delete_napi)
{
	struct veth_priv *priv = netdev_priv(dev);
	int idx = start;

	while (idx < end) {
		struct veth_rq *queue = &priv->rq[idx++];

		/* put back the saved mem info before unregistering */
		queue->xdp_rxq.mem = queue->xdp_mem;
		xdp_rxq_info_unreg(&queue->xdp_rxq);

		if (!delete_napi)
			continue;
		netif_napi_del(&queue->xdp_napi);
	}
}
static int veth_enable_xdp(struct net_device *dev)
{
bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
@ -990,29 +1060,16 @@ static int veth_enable_xdp(struct net_device *dev)
int err, i;
if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
if (!napi_already_on)
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
if (err < 0)
goto err_rxq_reg;
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_PAGE_SHARED,
NULL);
if (err < 0)
goto err_reg_mem;
/* Save original mem info as it can be overwritten */
rq->xdp_mem = rq->xdp_rxq.mem;
}
err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
if (err)
return err;
if (!napi_already_on) {
err = __veth_napi_enable(dev);
if (err)
goto err_rxq_reg;
if (err) {
veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
return err;
}
if (!veth_gro_requested(dev)) {
/* user-space did not require GRO, but adding XDP
@ -1030,18 +1087,6 @@ static int veth_enable_xdp(struct net_device *dev)
}
return 0;
err_reg_mem:
xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
for (i--; i >= 0; i--) {
struct veth_rq *rq = &priv->rq[i];
xdp_rxq_info_unreg(&rq->xdp_rxq);
if (!napi_already_on)
netif_napi_del(&rq->xdp_napi);
}
return err;
}
static void veth_disable_xdp(struct net_device *dev)
@ -1064,28 +1109,23 @@ static void veth_disable_xdp(struct net_device *dev)
}
}
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
rq->xdp_rxq.mem = rq->xdp_mem;
xdp_rxq_info_unreg(&rq->xdp_rxq);
}
veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
}
static int veth_napi_enable(struct net_device *dev)
static int veth_napi_enable_range(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
for (i = 0; i < dev->real_num_rx_queues; i++) {
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
}
err = __veth_napi_enable(dev);
err = __veth_napi_enable_range(dev, start, end);
if (err) {
for (i = 0; i < dev->real_num_rx_queues; i++) {
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_del(&rq->xdp_napi);
@ -1095,6 +1135,128 @@ static int veth_napi_enable(struct net_device *dev)
return err;
}
/* Run veth_napi_enable_range() over every currently active RX queue of
 * @dev, i.e. the index range [0, dev->real_num_rx_queues), and return its
 * result.
 */
static int veth_napi_enable(struct net_device *dev)
{
	int nr_rxq = dev->real_num_rx_queues;

	return veth_napi_enable_range(dev, 0, nr_rxq);
}
static void veth_disable_range_safe(struct net_device *dev, int start, int end)
{
struct veth_priv *priv = netdev_priv(dev);
if (start >= end)
return;
if (priv->_xdp_prog) {
veth_napi_del_range(dev, start, end);
veth_disable_xdp_range(dev, start, end, false);
} else if (veth_gro_requested(dev)) {
veth_napi_del_range(dev, start, end);
}
}
/* Bring up the RX queues in [start, end) according to the current device
 * configuration; an empty or inverted range succeeds without doing anything.
 *
 * With an XDP program attached, the XDP rxq info is registered and NAPI is
 * enabled; with only GRO requested, just NAPI is set up; otherwise nothing
 * is done. Returns 0 on success or the error of the failing step.
 */
static int veth_enable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int ret;

	if (end <= start)
		return 0;

	if (!priv->_xdp_prog) {
		if (!veth_gro_requested(dev))
			return 0;
		return veth_napi_enable_range(dev, start, end);
	}

	/* these channels are freshly initialized, napi is not on there even
	 * when GRO is requested
	 */
	ret = veth_enable_xdp_range(dev, start, end, false);
	if (ret)
		return ret;

	ret = __veth_napi_enable_range(dev, start, end);
	if (ret) {
		/* on error always delete the newly added napis */
		veth_disable_xdp_range(dev, start, end, true);
	}
	return ret;
}
/* ethtool set_channels handler: change the number of active RX/TX queues.
 *
 * @dev: device to reconfigure
 * @ch:  requested channel counts; only rx_count and tx_count are used
 *
 * Returns 0 on success, -EINVAL when a count is zero or when the new counts
 * would leave an XDP-enabled end with fewer RX queues than its peer has TX
 * queues, or the error reported while allocating the new queues or updating
 * the real queue numbers.
 *
 * When the device is running, the carrier is turned off on both ends while
 * queues are added or removed, and turned back on before returning.
 */
static int veth_set_channels(struct net_device *dev,
			     struct ethtool_channels *ch)
{
	struct veth_priv *priv = netdev_priv(dev);
	unsigned int old_rx_count, new_rx_count;
	struct veth_priv *peer_priv;
	struct net_device *peer;
	int err;

	/* sanity check. Upper bounds are already enforced by the caller */
	if (!ch->rx_count || !ch->tx_count)
		return -EINVAL;

	/* avoid breaking XDP, if that is enabled */
	peer = rtnl_dereference(priv->peer);
	peer_priv = peer ? netdev_priv(peer) : NULL;
	if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
		return -EINVAL;

	if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
		return -EINVAL;

	old_rx_count = dev->real_num_rx_queues;
	new_rx_count = ch->rx_count;
	if (netif_running(dev)) {
		/* turn device off */
		netif_carrier_off(dev);
		if (peer)
			netif_carrier_off(peer);

		/* try to allocate new resources, as needed */
		err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
		if (err)
			goto out;
	}

	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
	if (err)
		goto revert;

	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
	if (err) {
		int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);

		/* this error condition could happen only if rx and tx change
		 * in opposite directions (e.g. tx nr raises, rx nr decreases)
		 * and we can't do anything to fully restore the original
		 * status
		 */
		if (err2)
			pr_warn("Can't restore rx queues config %u -> %u %d\n",
				new_rx_count, old_rx_count, err2);
		else
			goto revert;
	}

out:
	if (netif_running(dev)) {
		/* note that we need to swap the arguments WRT the enable part
		 * to identify the range we have to disable
		 */
		veth_disable_range_safe(dev, new_rx_count, old_rx_count);
		netif_carrier_on(dev);
		if (peer)
			netif_carrier_on(peer);
	}
	return err;

revert:
	/* swap old and new so that the 'out' path disables the queues that
	 * were enabled above for the aborted change
	 */
	new_rx_count = old_rx_count;
	old_rx_count = ch->rx_count;
	goto out;
}
static int veth_open(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
@ -1447,6 +1609,23 @@ static void veth_disable_gro(struct net_device *dev)
netdev_update_features(dev);
}
/* Set the initial number of active queues of a newly created device.
 *
 * For each direction, when the matching IFLA_NUM_{TX,RX}_QUEUES attribute is
 * absent from @tb and the device has more than one queue allocated, the real
 * queue number is set to 1. Returns 0 on success or the error from
 * netif_set_real_num_{tx,rx}_queues().
 */
static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
{
	int ret = 0;

	if (dev->num_tx_queues > 1 && !tb[IFLA_NUM_TX_QUEUES])
		ret = netif_set_real_num_tx_queues(dev, 1);
	if (ret)
		return ret;

	if (dev->num_rx_queues > 1 && !tb[IFLA_NUM_RX_QUEUES])
		ret = netif_set_real_num_rx_queues(dev, 1);
	return ret;
}
static int veth_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
@ -1556,13 +1735,21 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
priv = netdev_priv(dev);
rcu_assign_pointer(priv->peer, peer);
err = veth_init_queues(dev, tb);
if (err)
goto err_queues;
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
err = veth_init_queues(peer, tb);
if (err)
goto err_queues;
veth_disable_gro(dev);
return 0;
err_queues:
unregister_netdevice(dev);
err_register_dev:
/* nothing to do */
err_configure_peer:
@ -1608,6 +1795,16 @@ static struct net *veth_get_link_net(const struct net_device *dev)
return peer ? dev_net(peer) : dev_net(dev);
}
/* Number of TX/RX queues to allocate for a veth device: one per possible
 * CPU, capped at 4096.
 */
static unsigned int veth_get_num_queues(void)
{
	/* enforce the same queue limit as rtnl_create_link */
	int nr_cpus = num_possible_cpus();

	return nr_cpus > 4096 ? 4096 : nr_cpus;
}
static struct rtnl_link_ops veth_link_ops = {
.kind = DRV_NAME,
.priv_size = sizeof(struct veth_priv),
@ -1618,6 +1815,8 @@ static struct rtnl_link_ops veth_link_ops = {
.policy = veth_policy,
.maxtype = VETH_INFO_MAX,
.get_link_net = veth_get_link_net,
.get_num_tx_queues = veth_get_num_queues,
.get_num_rx_queues = veth_get_num_queues,
};
/*

View File

@ -13,7 +13,7 @@ readonly NS_DST=$BASE$DST
readonly BM_NET_V4=192.168.1.
readonly BM_NET_V6=2001:db8::
readonly NPROCS=`nproc`
readonly CPUS=`nproc`
ret=0
cleanup() {
@ -75,6 +75,29 @@ chk_tso_flag() {
__chk_flag "$1" $2 $3 tcp-segmentation-offload
}
# Check the channel counts reported by 'ethtool -l' for veth$target.
#   $1: message printed in front of the result
#   $2: target suffix; selects both the namespace ($BASE$target) and the device
#   $3: expected RX channel count
#   $4: expected TX channel count
# The "Combined:" value is always expected to be "n/a". A mismatch is
# reported and recorded in the script-wide 'ret' status.
chk_channels() {
	local msg="$1"
	local target=$2
	local rx=$3
	local tx=$4

	local dev=veth$target
	local settings cur_rx cur_tx cur_combined

	# query the device once; for every label the last matching line is used
	settings=$(ip netns exec $BASE$target ethtool -l $dev)
	cur_rx=$(echo "$settings" | grep RX: | tail -n 1 | awk '{print $2}')
	cur_tx=$(echo "$settings" | grep TX: | tail -n 1 | awk '{print $2}')
	cur_combined=$(echo "$settings" | grep Combined: | tail -n 1 | awk '{print $2}')

	printf "%-60s" "$msg"
	if [ "$cur_rx" = "$rx" -a "$cur_tx" = "$tx" -a "$cur_combined" = "n/a" ]; then
		echo " ok "
	else
		echo " fail rx:$rx:$cur_rx tx:$tx:$cur_tx combined:n/a:$cur_combined"
		# make the failure visible in the script-wide status
		ret=1
	fi
}
chk_gro() {
local msg="$1"
local expected=$2
@ -107,11 +130,100 @@ chk_gro() {
fi
}
# Keep changing the channel count of both veth ends until the given time.
#   $1: end time, in seconds since the epoch
# Each round ramps rx/tx from 1 up to $CPUS and then back down to 1.
__change_channels()
{
	local cur_cpu
	local end=$1
	local cur
	local i

	while true; do
		# explicit -1 (current time): bash older than 4.3 does not
		# default to "now" when the argument is omitted
		printf -v cur '%(%s)T' -1
		[ $cur -le $end ] || break

		for i in $(seq 1 $CPUS); do
			ip netns exec $NS_SRC ethtool -L veth$SRC rx $i tx $i
			ip netns exec $NS_DST ethtool -L veth$DST rx $i tx $i
		done

		for i in $(seq 1 $((CPUS - 1))); do
			cur_cpu=$((CPUS - $i))
			ip netns exec $NS_SRC ethtool -L veth$SRC rx $cur_cpu tx $cur_cpu
			ip netns exec $NS_DST ethtool -L veth$DST rx $cur_cpu tx $cur_cpu
		done
	done
}
# Repeatedly run udpgso_bench_tx from the source namespace towards the
# destination address until the given time.
#   $1: end time, in seconds since the epoch
__send_data() {
	local end=$1
	# keep the timestamp local instead of leaking it into the caller's scope
	local cur

	while true; do
		# explicit -1 (current time): bash older than 4.3 does not
		# default to "now" when the argument is omitted
		printf -v cur '%(%s)T' -1
		[ $cur -le $end ] || break

		ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 1000 -M 300 -D $BM_NET_V4$DST
	done
}
# Stress test: for $STRESS seconds, four background senders push UDP traffic
# while another background job keeps changing the channel count of both veth
# ends. The receiver is killed once all jobs are done and the channel
# configuration is then set back to fixed values.
do_stress() {
	local end
	# explicit -1 (current time): bash older than 4.3 does not default to
	# "now" when the argument is omitted
	printf -v end '%(%s)T' -1
	end=$((end + $STRESS))

	ip netns exec $NS_SRC ethtool -L veth$SRC rx 3 tx 3
	ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3

	ip netns exec $NS_DST ./udpgso_bench_rx &
	local rx_pid=$!

	echo "Running stress test for $STRESS seconds..."
	__change_channels $end &
	local ch_pid=$!
	__send_data $end &
	local data_pid_1=$!
	__send_data $end &
	local data_pid_2=$!
	__send_data $end &
	local data_pid_3=$!
	__send_data $end &
	local data_pid_4=$!

	wait $ch_pid $data_pid_1 $data_pid_2 $data_pid_3 $data_pid_4
	kill -9 $rx_pid
	echo "done"

	# restore previous setting
	ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 tx 2
	ip netns exec $NS_DST ethtool -L veth$DST rx 2 tx 1
}
# Print the command line help on stdout.
usage() {
	printf 'Usage: %s [-h] [-s <seconds>]\n' "$0"
	printf '\t-h: show this help\n'
	printf '\t-s: run optional stress tests for the given amount of seconds\n'
}
# Command line parsing. STRESS is the stress test duration in seconds;
# 0 (the default) disables the stress test.
STRESS=0
while getopts "hs:" option; do
	case "$option" in
	"h")
		usage $0
		exit 0
		;;
	"s")
		STRESS=$OPTARG
		;;
	*)
		# unknown option or missing argument: do not silently go on
		usage $0
		exit 1
		;;
	esac
done
# Pre-flight checks: the XDP tests need the xdp_dummy object from the bpf
# selftests, and some tests are skipped when too few CPUs are available.
[ -f ../bpf/xdp_dummy.o ] || {
	echo "Missing xdp_dummy helper. Build bpf selftest first"
	exit 1
}

if [ $CPUS -lt 2 ]; then
	echo "Only one CPU available, some tests will be skipped"
fi
if [ $STRESS -gt 0 -a $CPUS -lt 3 ]; then
	echo " stress test will be skipped, too"
fi
create_ns
chk_gro_flag "default - gro flag" $SRC off
chk_gro_flag " - peer gro flag" $DST off
@ -134,6 +246,8 @@ chk_gro " - aggregation with TSO off" 1
cleanup
create_ns
chk_channels "default channels" $DST 1 1
ip -n $NS_DST link set dev veth$DST down
ip netns exec $NS_DST ethtool -K veth$DST gro on
chk_gro_flag "with gro enabled on link down - gro flag" $DST on
@ -147,6 +261,56 @@ chk_gro " - aggregation with TSO off" 1
cleanup
create_ns
CUR_TX=1
CUR_RX=1
if [ $CPUS -gt 1 ]; then
ip netns exec $NS_DST ethtool -L veth$DST tx 2
chk_channels "setting tx channels" $DST 1 2
CUR_TX=2
fi
if [ $CPUS -gt 2 ]; then
ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3
chk_channels "setting both rx and tx channels" $DST 3 3
CUR_RX=3
CUR_TX=3
fi
ip netns exec $NS_DST ethtool -L veth$DST combined 2 2>/dev/null
chk_channels "bad setting: combined channels" $DST $CUR_RX $CUR_TX
ip netns exec $NS_DST ethtool -L veth$DST tx $((CPUS + 1)) 2>/dev/null
chk_channels "setting invalid channels nr" $DST $CUR_RX $CUR_TX
if [ $CPUS -gt 1 ]; then
# this also tests queues nr reduction
ip netns exec $NS_DST ethtool -L veth$DST rx 1 tx 2 2>/dev/null
ip netns exec $NS_SRC ethtool -L veth$SRC rx 1 tx 2 2>/dev/null
printf "%-60s" "bad setting: XDP with RX nr less than TX"
ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \
section xdp_dummy 2>/dev/null &&\
echo "fail - set operation successful ?!?" || echo " ok "
# the following tests will run with multiple channels active
ip netns exec $NS_SRC ethtool -L veth$SRC rx 2
ip netns exec $NS_DST ethtool -L veth$DST rx 2
ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \
section xdp_dummy 2>/dev/null
printf "%-60s" "bad setting: reducing RX nr below peer TX with XDP set"
ip netns exec $NS_DST ethtool -L veth$DST rx 1 2>/dev/null &&\
echo "fail - set operation successful ?!?" || echo " ok "
CUR_RX=2
CUR_TX=2
fi
if [ $CPUS -gt 2 ]; then
printf "%-60s" "bad setting: increasing peer TX nr above RX with XDP set"
ip netns exec $NS_SRC ethtool -L veth$SRC tx 3 2>/dev/null &&\
echo "fail - set operation successful ?!?" || echo " ok "
chk_channels "setting invalid channels nr" $DST 2 2
fi
ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null
chk_gro_flag "with xdp attached - gro flag" $DST on
chk_gro_flag " - peer gro flag" $SRC off
@ -167,10 +331,27 @@ chk_gro_flag " - after gro on xdp off, gro flag" $DST on
chk_gro_flag " - peer gro flag" $SRC off
chk_tso_flag " - tso flag" $SRC on
chk_tso_flag " - peer tso flag" $DST on
if [ $CPUS -gt 1 ]; then
ip netns exec $NS_DST ethtool -L veth$DST tx 1
chk_channels "decreasing tx channels with device down" $DST 2 1
fi
ip -n $NS_DST link set dev veth$DST up
ip -n $NS_SRC link set dev veth$SRC up
chk_gro " - aggregation" 1
if [ $CPUS -gt 1 ]; then
[ $STRESS -gt 0 -a $CPUS -gt 2 ] && do_stress
ip -n $NS_DST link set dev veth$DST down
ip -n $NS_SRC link set dev veth$SRC down
ip netns exec $NS_DST ethtool -L veth$DST tx 2
chk_channels "increasing tx channels with device down" $DST 2 2
ip -n $NS_DST link set dev veth$DST up
ip -n $NS_SRC link set dev veth$SRC up
fi
ip netns exec $NS_DST ethtool -K veth$DST gro off
ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
chk_gro "aggregation again with default and TSO off" 10