diff --git a/drivers/net/veth.c b/drivers/net/veth.c index bdb7ce3cb054..381670c08ba7 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -224,12 +224,13 @@ static void veth_get_channels(struct net_device *dev, { channels->tx_count = dev->real_num_tx_queues; channels->rx_count = dev->real_num_rx_queues; - channels->max_tx = dev->real_num_tx_queues; - channels->max_rx = dev->real_num_rx_queues; - channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues); - channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues); + channels->max_tx = dev->num_tx_queues; + channels->max_rx = dev->num_rx_queues; } +static int veth_set_channels(struct net_device *dev, + struct ethtool_channels *ch); + static const struct ethtool_ops veth_ethtool_ops = { .get_drvinfo = veth_get_drvinfo, .get_link = ethtool_op_get_link, @@ -239,6 +240,7 @@ static const struct ethtool_ops veth_ethtool_ops = { .get_link_ksettings = veth_get_link_ksettings, .get_ts_info = ethtool_op_get_ts_info, .get_channels = veth_get_channels, + .set_channels = veth_set_channels, }; /* general routines */ @@ -928,12 +930,12 @@ static int veth_poll(struct napi_struct *napi, int budget) return done; } -static int __veth_napi_enable(struct net_device *dev) +static int __veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); @@ -941,7 +943,7 @@ static int __veth_napi_enable(struct net_device *dev) goto err_xdp_ring; } - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; napi_enable(&rq->xdp_napi); @@ -949,19 +951,25 @@ static int __veth_napi_enable(struct net_device *dev) } return 0; + err_xdp_ring: - for (i--; i >= 0; i--) + for (i--; i >= start; i--) 
ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); return err; } -static void veth_napi_del(struct net_device *dev) +static int __veth_napi_enable(struct net_device *dev) +{ + return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); +} + +static void veth_napi_del_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int i; - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rcu_assign_pointer(priv->rq[i].napi, NULL); @@ -970,7 +978,7 @@ static void veth_napi_del(struct net_device *dev) } synchronize_net(); - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rq->rx_notify_masked = false; @@ -978,11 +986,73 @@ static void veth_napi_del(struct net_device *dev) } } +static void veth_napi_del(struct net_device *dev) +{ + veth_napi_del_range(dev, 0, dev->real_num_rx_queues); +} + static bool veth_gro_requested(const struct net_device *dev) { return !!(dev->wanted_features & NETIF_F_GRO); } +static int veth_enable_xdp_range(struct net_device *dev, int start, int end, + bool napi_already_on) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + for (i = start; i < end; i++) { + struct veth_rq *rq = &priv->rq[i]; + + if (!napi_already_on) + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); + if (err < 0) + goto err_rxq_reg; + + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (err < 0) + goto err_reg_mem; + + /* Save original mem info as it can be overwritten */ + rq->xdp_mem = rq->xdp_rxq.mem; + } + return 0; + +err_reg_mem: + xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); +err_rxq_reg: + for (i--; i >= start; i--) { + struct veth_rq *rq = &priv->rq[i]; + + xdp_rxq_info_unreg(&rq->xdp_rxq); + if (!napi_already_on) + netif_napi_del(&rq->xdp_napi); + } + + return 
err; +} + +static void veth_disable_xdp_range(struct net_device *dev, int start, int end, + bool delete_napi) +{ + struct veth_priv *priv = netdev_priv(dev); + int i; + + for (i = start; i < end; i++) { + struct veth_rq *rq = &priv->rq[i]; + + rq->xdp_rxq.mem = rq->xdp_mem; + xdp_rxq_info_unreg(&rq->xdp_rxq); + + if (delete_napi) + netif_napi_del(&rq->xdp_napi); + } +} + static int veth_enable_xdp(struct net_device *dev) { bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); @@ -990,29 +1060,16 @@ static int veth_enable_xdp(struct net_device *dev) int err, i; if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { - for (i = 0; i < dev->real_num_rx_queues; i++) { - struct veth_rq *rq = &priv->rq[i]; - - if (!napi_already_on) - netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); - err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); - if (err < 0) - goto err_rxq_reg; - - err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, - MEM_TYPE_PAGE_SHARED, - NULL); - if (err < 0) - goto err_reg_mem; - - /* Save original mem info as it can be overwritten */ - rq->xdp_mem = rq->xdp_rxq.mem; - } + err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); + if (err) + return err; if (!napi_already_on) { err = __veth_napi_enable(dev); - if (err) - goto err_rxq_reg; + if (err) { + veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); + return err; + } if (!veth_gro_requested(dev)) { /* user-space did not require GRO, but adding XDP @@ -1030,18 +1087,6 @@ static int veth_enable_xdp(struct net_device *dev) } return 0; -err_reg_mem: - xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); -err_rxq_reg: - for (i--; i >= 0; i--) { - struct veth_rq *rq = &priv->rq[i]; - - xdp_rxq_info_unreg(&rq->xdp_rxq); - if (!napi_already_on) - netif_napi_del(&rq->xdp_napi); - } - - return err; } static void veth_disable_xdp(struct net_device *dev) @@ -1064,28 +1109,23 @@ static void veth_disable_xdp(struct net_device *dev) } } - for (i = 0; i < 
dev->real_num_rx_queues; i++) { - struct veth_rq *rq = &priv->rq[i]; - - rq->xdp_rxq.mem = rq->xdp_mem; - xdp_rxq_info_unreg(&rq->xdp_rxq); - } + veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); } -static int veth_napi_enable(struct net_device *dev) +static int veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); } - err = __veth_napi_enable(dev); + err = __veth_napi_enable_range(dev, start, end); if (err) { - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_del(&rq->xdp_napi); @@ -1095,6 +1135,128 @@ static int veth_napi_enable(struct net_device *dev) return err; } +static int veth_napi_enable(struct net_device *dev) +{ + return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); +} + +static void veth_disable_range_safe(struct net_device *dev, int start, int end) +{ + struct veth_priv *priv = netdev_priv(dev); + + if (start >= end) + return; + + if (priv->_xdp_prog) { + veth_napi_del_range(dev, start, end); + veth_disable_xdp_range(dev, start, end, false); + } else if (veth_gro_requested(dev)) { + veth_napi_del_range(dev, start, end); + } +} + +static int veth_enable_range_safe(struct net_device *dev, int start, int end) +{ + struct veth_priv *priv = netdev_priv(dev); + int err; + + if (start >= end) + return 0; + + if (priv->_xdp_prog) { + /* these channels are freshly initialized, napi is not on there even + * when GRO is requested + */ + err = veth_enable_xdp_range(dev, start, end, false); + if (err) + return err; + + err = __veth_napi_enable_range(dev, start, end); + if (err) { + /* on error always delete the newly added napis */ + veth_disable_xdp_range(dev, start, end, true); + return err; + } + } else if 
(veth_gro_requested(dev)) { + return veth_napi_enable_range(dev, start, end); + } + return 0; +} + +static int veth_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct veth_priv *priv = netdev_priv(dev); + unsigned int old_rx_count, new_rx_count; + struct veth_priv *peer_priv; + struct net_device *peer; + int err; + + /* sanity check. Upper bounds are already enforced by the caller */ + if (!ch->rx_count || !ch->tx_count) + return -EINVAL; + + /* avoid breaking XDP, if that is enabled */ + peer = rtnl_dereference(priv->peer); + peer_priv = peer ? netdev_priv(peer) : NULL; + if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) + return -EINVAL; + + if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) + return -EINVAL; + + old_rx_count = dev->real_num_rx_queues; + new_rx_count = ch->rx_count; + if (netif_running(dev)) { + /* turn device off */ + netif_carrier_off(dev); + if (peer) + netif_carrier_off(peer); + + /* try to allocate new resources, as needed */ + err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); + if (err) + goto out; + } + + err = netif_set_real_num_rx_queues(dev, ch->rx_count); + if (err) + goto revert; + + err = netif_set_real_num_tx_queues(dev, ch->tx_count); + if (err) { + int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); + + /* this error condition could happen only if rx and tx change + * in opposite directions (e.g. 
tx nr raises, rx nr decreases) + * and we can't do anything to fully restore the original + * status + */ + if (err2) + pr_warn("Can't restore rx queues config %d -> %d %d", + new_rx_count, old_rx_count, err2); + else + goto revert; + } + +out: + if (netif_running(dev)) { + /* note that we need to swap the arguments WRT the enable part + * to identify the range we have to disable + */ + veth_disable_range_safe(dev, new_rx_count, old_rx_count); + netif_carrier_on(dev); + if (peer) + netif_carrier_on(peer); + } + return err; + +revert: + new_rx_count = old_rx_count; + old_rx_count = ch->rx_count; + goto out; +} + static int veth_open(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -1447,6 +1609,23 @@ static void veth_disable_gro(struct net_device *dev) netdev_update_features(dev); } +static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) +{ + int err; + + if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { + err = netif_set_real_num_tx_queues(dev, 1); + if (err) + return err; + } + if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { + err = netif_set_real_num_rx_queues(dev, 1); + if (err) + return err; + } + return 0; +} + static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) @@ -1556,13 +1735,21 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); + err = veth_init_queues(dev, tb); + if (err) + goto err_queues; priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + err = veth_init_queues(peer, tb); + if (err) + goto err_queues; veth_disable_gro(dev); return 0; +err_queues: + unregister_netdevice(dev); err_register_dev: /* nothing to do */ err_configure_peer: @@ -1608,6 +1795,16 @@ static struct net *veth_get_link_net(const struct net_device *dev) return peer ? 
dev_net(peer) : dev_net(dev); } +static unsigned int veth_get_num_queues(void) +{ + /* enforce the same queue limit as rtnl_create_link */ + int queues = num_possible_cpus(); + + if (queues > 4096) + queues = 4096; + return queues; +} + static struct rtnl_link_ops veth_link_ops = { .kind = DRV_NAME, .priv_size = sizeof(struct veth_priv), @@ -1618,6 +1815,8 @@ static struct rtnl_link_ops veth_link_ops = { .policy = veth_policy, .maxtype = VETH_INFO_MAX, .get_link_net = veth_get_link_net, + .get_num_tx_queues = veth_get_num_queues, + .get_num_rx_queues = veth_get_num_queues, }; /* diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 11d7cdb898c0..19eac3e44c06 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -13,7 +13,7 @@ readonly NS_DST=$BASE$DST readonly BM_NET_V4=192.168.1. readonly BM_NET_V6=2001:db8:: -readonly NPROCS=`nproc` +readonly CPUS=`nproc` ret=0 cleanup() { @@ -75,6 +75,29 @@ chk_tso_flag() { __chk_flag "$1" $2 $3 tcp-segmentation-offload } +chk_channels() { + local msg="$1" + local target=$2 + local rx=$3 + local tx=$4 + + local dev=veth$target + + local cur_rx=`ip netns exec $BASE$target ethtool -l $dev |\ + grep RX: | tail -n 1 | awk '{print $2}' ` + local cur_tx=`ip netns exec $BASE$target ethtool -l $dev |\ + grep TX: | tail -n 1 | awk '{print $2}'` + local cur_combined=`ip netns exec $BASE$target ethtool -l $dev |\ + grep Combined: | tail -n 1 | awk '{print $2}'` + + printf "%-60s" "$msg" + if [ "$cur_rx" = "$rx" -a "$cur_tx" = "$tx" -a "$cur_combined" = "n/a" ]; then + echo " ok " + else + echo " fail rx:$rx:$cur_rx tx:$tx:$cur_tx combined:n/a:$cur_combined" + fi +} + chk_gro() { local msg="$1" local expected=$2 @@ -107,11 +130,100 @@ chk_gro() { fi } +__change_channels() +{ + local cur_cpu + local end=$1 + local cur + local i + + while true; do + printf -v cur '%(%s)T' + [ $cur -le $end ] || break + + for i in `seq 1 $CPUS`; do + ip netns exec $NS_SRC ethtool -L 
veth$SRC rx $i tx $i + ip netns exec $NS_DST ethtool -L veth$DST rx $i tx $i + done + + for i in `seq 1 $((CPUS - 1))`; do + cur_cpu=$((CPUS - $i)) + ip netns exec $NS_SRC ethtool -L veth$SRC rx $cur_cpu tx $cur_cpu + ip netns exec $NS_DST ethtool -L veth$DST rx $cur_cpu tx $cur_cpu + done + done +} + +__send_data() { + local end=$1 + + while true; do + printf -v cur '%(%s)T' + [ $cur -le $end ] || break + + ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 1000 -M 300 -D $BM_NET_V4$DST + done +} + +do_stress() { + local end + printf -v end '%(%s)T' + end=$((end + $STRESS)) + + ip netns exec $NS_SRC ethtool -L veth$SRC rx 3 tx 3 + ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3 + + ip netns exec $NS_DST ./udpgso_bench_rx & + local rx_pid=$! + + echo "Running stress test for $STRESS seconds..." + __change_channels $end & + local ch_pid=$! + __send_data $end & + local data_pid_1=$! + __send_data $end & + local data_pid_2=$! + __send_data $end & + local data_pid_3=$! + __send_data $end & + local data_pid_4=$! + + wait $ch_pid $data_pid_1 $data_pid_2 $data_pid_3 $data_pid_4 + kill -9 $rx_pid + echo "done" + + # restore previous setting + ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 tx 2 + ip netns exec $NS_DST ethtool -L veth$DST rx 2 tx 1 +} + +usage() { + echo "Usage: $0 [-h] [-s ]" + echo -e "\t-h: show this help" + echo -e "\t-s: run optional stress tests for the given amount of seconds" +} + +STRESS=0 +while getopts "hs:" option; do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "s") + STRESS=$OPTARG + ;; + esac +done + if [ ! -f ../bpf/xdp_dummy.o ]; then echo "Missing xdp_dummy helper. 
Build bpf selftest first" exit 1 fi +[ $CPUS -lt 2 ] && echo "Only one CPU available, some tests will be skipped" +[ $STRESS -gt 0 -a $CPUS -lt 3 ] && echo " stress test will be skipped, too" + create_ns chk_gro_flag "default - gro flag" $SRC off chk_gro_flag " - peer gro flag" $DST off @@ -134,6 +246,8 @@ chk_gro " - aggregation with TSO off" 1 cleanup create_ns +chk_channels "default channels" $DST 1 1 + ip -n $NS_DST link set dev veth$DST down ip netns exec $NS_DST ethtool -K veth$DST gro on chk_gro_flag "with gro enabled on link down - gro flag" $DST on @@ -147,6 +261,56 @@ chk_gro " - aggregation with TSO off" 1 cleanup create_ns + +CUR_TX=1 +CUR_RX=1 +if [ $CPUS -gt 1 ]; then + ip netns exec $NS_DST ethtool -L veth$DST tx 2 + chk_channels "setting tx channels" $DST 1 2 + CUR_TX=2 +fi + +if [ $CPUS -gt 2 ]; then + ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3 + chk_channels "setting both rx and tx channels" $DST 3 3 + CUR_RX=3 + CUR_TX=3 +fi + +ip netns exec $NS_DST ethtool -L veth$DST combined 2 2>/dev/null +chk_channels "bad setting: combined channels" $DST $CUR_RX $CUR_TX + +ip netns exec $NS_DST ethtool -L veth$DST tx $((CPUS + 1)) 2>/dev/null +chk_channels "setting invalid channels nr" $DST $CUR_RX $CUR_TX + +if [ $CPUS -gt 1 ]; then + # this also tests queues nr reduction + ip netns exec $NS_DST ethtool -L veth$DST rx 1 tx 2 2>/dev/null + ip netns exec $NS_SRC ethtool -L veth$SRC rx 1 tx 2 2>/dev/null + printf "%-60s" "bad setting: XDP with RX nr less than TX" + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + section xdp_dummy 2>/dev/null &&\ + echo "fail - set operation successful ?!?" 
|| echo " ok " + + # the following tests will run with multiple channels active + ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 + ip netns exec $NS_DST ethtool -L veth$DST rx 2 + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + section xdp_dummy 2>/dev/null + printf "%-60s" "bad setting: reducing RX nr below peer TX with XDP set" + ip netns exec $NS_DST ethtool -L veth$DST rx 1 2>/dev/null &&\ + echo "fail - set operation successful ?!?" || echo " ok " + CUR_RX=2 + CUR_TX=2 +fi + +if [ $CPUS -gt 2 ]; then + printf "%-60s" "bad setting: increasing peer TX nr above RX with XDP set" + ip netns exec $NS_SRC ethtool -L veth$SRC tx 3 2>/dev/null &&\ + echo "fail - set operation successful ?!?" || echo " ok " + chk_channels "setting invalid channels nr" $DST 2 2 +fi + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null chk_gro_flag "with xdp attached - gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off @@ -167,10 +331,27 @@ chk_gro_flag " - after gro on xdp off, gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off chk_tso_flag " - tso flag" $SRC on chk_tso_flag " - peer tso flag" $DST on + +if [ $CPUS -gt 1 ]; then + ip netns exec $NS_DST ethtool -L veth$DST tx 1 + chk_channels "decreasing tx channels with device down" $DST 2 1 +fi + ip -n $NS_DST link set dev veth$DST up ip -n $NS_SRC link set dev veth$SRC up chk_gro " - aggregation" 1 +if [ $CPUS -gt 1 ]; then + [ $STRESS -gt 0 -a $CPUS -gt 2 ] && do_stress + + ip -n $NS_DST link set dev veth$DST down + ip -n $NS_SRC link set dev veth$SRC down + ip netns exec $NS_DST ethtool -L veth$DST tx 2 + chk_channels "increasing tx channels with device down" $DST 2 2 + ip -n $NS_DST link set dev veth$DST up + ip -n $NS_SRC link set dev veth$SRC up +fi + ip netns exec $NS_DST ethtool -K veth$DST gro off ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off chk_gro "aggregation again with default and TSO off" 10