2017-11-01 22:08:43 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifndef __LINUX_PKT_SCHED_H
|
|
|
|
#define __LINUX_PKT_SCHED_H
|
|
|
|
|
2019-07-10 05:45:17 +08:00
|
|
|
#include <linux/const.h>
|
2009-01-31 00:37:05 +08:00
|
|
|
#include <linux/types.h>
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Logical priority bands not depending on specific packet scheduler.
|
|
|
|
Every scheduler will map them to real traffic classes, if it has
|
|
|
|
no more precise mechanism to classify packets.
|
|
|
|
|
|
|
|
These numbers have no special meaning, though their coincidence
|
|
|
|
with obsolete IPv6 values is not accidental :-). New IPv6 drafts
|
|
|
|
preferred full anarchy inspired by diffserv group.
|
|
|
|
|
|
|
|
Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy
|
|
|
|
class, actually, as a rule it will be handled with more care than
|
|
|
|
filler or even bulk.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Logical priority band numbers (scheduler independent). */
#define TC_PRIO_BESTEFFORT		0
#define TC_PRIO_FILLER			1
#define TC_PRIO_BULK			2
#define TC_PRIO_INTERACTIVE_BULK	4
#define TC_PRIO_INTERACTIVE		6
#define TC_PRIO_CONTROL			7

/* Upper bound of the logical priority range; priority maps in this
 * header are sized TC_PRIO_MAX + 1.
 */
#define TC_PRIO_MAX			15
|
|
|
|
|
|
|
|
/* Generic queue statistics, available for all the elements.
|
|
|
|
Particular schedulers may also have their private records.
|
|
|
|
*/
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* Per-element queue statistics record. */
struct tc_stats {
	__u64	bytes;			/* Number of enqueued bytes */
	__u32	packets;		/* Number of enqueued packets */
	__u32	drops;			/* Packets dropped because of lack of resources */
	__u32	overlimits;		/* Number of throttle events when this
					 * flow goes out of allocated bandwidth */
	__u32	bps;			/* Current flow byte rate */
	__u32	pps;			/* Current flow packet rate */
	__u32	qlen;			/* NOTE(review): presumably current queue length; units not stated here */
	__u32	backlog;		/* NOTE(review): presumably queued backlog; units not stated here */
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* Estimator parameters: two single-byte fields.
 * NOTE(review): the meaning/encoding of the fields is not described in
 * this header; confirm against the estimator implementation.
 */
struct tc_estimator {
	signed char	interval;
	unsigned char	ewma_log;
};
|
|
|
|
|
|
|
|
/* "Handles"
|
|
|
|
---------
|
|
|
|
|
|
|
|
All the traffic control objects have 32bit identifiers, or "handles".
|
|
|
|
|
|
|
|
They can be considered as opaque numbers from user API viewpoint,
|
|
|
|
but actually they always consist of two fields: major and
|
|
|
|
minor numbers, which are interpreted by kernel specially,
|
|
|
|
that may be used by applications, though not recommended.
|
|
|
|
|
|
|
|
F.e. qdisc handles always have minor number equal to zero,
|
|
|
|
classes (or flows) have major equal to parent qdisc major, and
|
|
|
|
minor uniquely identifying class inside qdisc.
|
|
|
|
|
|
|
|
Macros to manipulate handles:
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* A handle is 32 bits wide: major number in the upper 16 bits,
 * minor number in the lower 16 bits.
 */
#define TC_H_MAJ_MASK (0xFFFF0000U)
#define TC_H_MIN_MASK (0x0000FFFFU)

/* Mask out the major / minor part of handle h (bits stay in place,
 * they are not shifted down).
 */
#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK)
#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK)

/* Build a handle from the major bits of maj and the minor bits of min. */
#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK))

/* Special handle values. */
#define TC_H_UNSPEC	(0U)
#define TC_H_ROOT	(0xFFFFFFFFU)
#define TC_H_INGRESS    (0xFFFFFFF1U)
|
net, sched: add clsact qdisc
This work adds a generalization of the ingress qdisc as a qdisc holding
only classifiers. The clsact qdisc works on ingress, but also on egress.
In both cases, it's execution happens without taking the qdisc lock, and
the main difference for the egress part compared to prior version of [1]
is that this can be applied with _any_ underlying real egress qdisc (also
classless ones).
Besides solving the use-case of [1], that is, allowing for more programmability
on assigning skb->priority for the mqprio case that is supported by most
popular 10G+ NICs, it also opens up a lot more flexibility for other tc
applications. The main work on classification can already be done at clsact
egress time if the use-case allows and state stored for later retrieval
f.e. again in skb->priority with major/minors (which is checked by most
classful qdiscs before consulting tc_classify()) and/or in other skb fields
like skb->tc_index for some light-weight post-processing to get to the
eventual classid in case of a classful qdisc. Another use case is that
the clsact egress part allows to have a central egress counterpart to
the ingress classifiers, so that classifiers can easily share state (e.g.
in cls_bpf via eBPF maps) for ingress and egress.
Currently, default setups like mq + pfifo_fast would require for this to
use, for example, prio qdisc instead (to get a tc_classify() run) and to
duplicate the egress classifier for each queue. With clsact, it allows
for leaving the setup as is, it can additionally assign skb->priority to
put the skb in one of pfifo_fast's bands and it can share state with maps.
Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid)
w/o the need to perform a skb_dst_force() to hold on to it any longer. In
lwt case, we can also use this facility to setup dst metadata via cls_bpf
(bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for
that (case of IFF_NO_QUEUE devices, for example).
The realization can be done without any changes to the scheduler core
framework. All it takes is that we have two a-priori defined minors/child
classes, where we can mux between ingress and egress classifier list
(dev->ingress_cl_list and dev->egress_cl_list, latter stored close to
dev->_tx to avoid extra cacheline miss for moderate loads). The egress
part is a bit similar modelled to handle_ing() and patched to a noop in
case the functionality is not used. Both handlers are now called
sch_handle_ingress() and sch_handle_egress(), code sharing among the two
doesn't seem practical as there are various minor differences in both
paths, so that making them conditional in a single handler would rather
slow things down.
Full compatibility to ingress qdisc is provided as well. Since both
piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist
per netdevice, and thus ingress qdisc specific behaviour can be retained
for user space. This means, either a user does 'tc qdisc add dev foo ingress'
and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact'
alternative, where both, ingress and egress classifier can be configured
as in the below example. ingress qdisc supports attaching classifier to any
minor number whereas clsact has two fixed minors for muxing between the
lists, therefore to not break user space setups, they are better done as
two separate qdiscs.
I decided to extend the sch_ingress module with clsact functionality so
that commonly used code can be reused, the module is being aliased with
sch_clsact so that it can be auto-loaded properly. Alternative would have been
to add a flag when initializing ingress to alter its behaviour plus aliasing
to a different name (as it's more than just ingress). However, the first would
end up, based on the flag, choosing the new/old behaviour by calling different
function implementations to handle each anyway, the latter would require to
register ingress qdisc once again under different alias. So, this really begs
to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops
by its own that share callbacks used by both.
Example, adding qdisc:
# tc qdisc add dev foo clsact
# tc qdisc show dev foo
qdisc mq 0: root
qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc clsact ffff: parent ffff:fff1
Adding filters (deleting, etc works analogous by specifying ingress/egress):
# tc filter add dev foo ingress bpf da obj bar.o sec ingress
# tc filter add dev foo egress bpf da obj bar.o sec egress
# tc filter show dev foo ingress
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action
# tc filter show dev foo egress
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action
A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will
show an empty list for clsact. Either using the parent names (ingress/egress)
or specifying the full major/minor will then show the related filter lists.
Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend.
[1] http://patchwork.ozlabs.org/patch/512949/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-01-08 05:29:47 +08:00
|
|
|
/* clsact reuses the ingress handle value, so only one of the
 * ingress/clsact qdiscs can exist per netdevice.
 */
#define TC_H_CLSACT TC_H_INGRESS
|
|
|
|
|
2017-10-13 02:38:45 +08:00
|
|
|
/* NOTE(review): reserved minor-number value; its users are not visible
 * in this header -- confirm against the scheduler code before relying on it.
 */
#define TC_H_MIN_PRIORITY 0xFFE0U
|
net, sched: add clsact qdisc
This work adds a generalization of the ingress qdisc as a qdisc holding
only classifiers. The clsact qdisc works on ingress, but also on egress.
In both cases, it's execution happens without taking the qdisc lock, and
the main difference for the egress part compared to prior version of [1]
is that this can be applied with _any_ underlying real egress qdisc (also
classless ones).
Besides solving the use-case of [1], that is, allowing for more programmability
on assigning skb->priority for the mqprio case that is supported by most
popular 10G+ NICs, it also opens up a lot more flexibility for other tc
applications. The main work on classification can already be done at clsact
egress time if the use-case allows and state stored for later retrieval
f.e. again in skb->priority with major/minors (which is checked by most
classful qdiscs before consulting tc_classify()) and/or in other skb fields
like skb->tc_index for some light-weight post-processing to get to the
eventual classid in case of a classful qdisc. Another use case is that
the clsact egress part allows to have a central egress counterpart to
the ingress classifiers, so that classifiers can easily share state (e.g.
in cls_bpf via eBPF maps) for ingress and egress.
Currently, default setups like mq + pfifo_fast would require for this to
use, for example, prio qdisc instead (to get a tc_classify() run) and to
duplicate the egress classifier for each queue. With clsact, it allows
for leaving the setup as is, it can additionally assign skb->priority to
put the skb in one of pfifo_fast's bands and it can share state with maps.
Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid)
w/o the need to perform a skb_dst_force() to hold on to it any longer. In
lwt case, we can also use this facility to setup dst metadata via cls_bpf
(bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for
that (case of IFF_NO_QUEUE devices, for example).
The realization can be done without any changes to the scheduler core
framework. All it takes is that we have two a-priori defined minors/child
classes, where we can mux between ingress and egress classifier list
(dev->ingress_cl_list and dev->egress_cl_list, latter stored close to
dev->_tx to avoid extra cacheline miss for moderate loads). The egress
part is a bit similar modelled to handle_ing() and patched to a noop in
case the functionality is not used. Both handlers are now called
sch_handle_ingress() and sch_handle_egress(), code sharing among the two
doesn't seem practical as there are various minor differences in both
paths, so that making them conditional in a single handler would rather
slow things down.
Full compatibility to ingress qdisc is provided as well. Since both
piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist
per netdevice, and thus ingress qdisc specific behaviour can be retained
for user space. This means, either a user does 'tc qdisc add dev foo ingress'
and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact'
alternative, where both, ingress and egress classifier can be configured
as in the below example. ingress qdisc supports attaching classifier to any
minor number whereas clsact has two fixed minors for muxing between the
lists, therefore to not break user space setups, they are better done as
two separate qdiscs.
I decided to extend the sch_ingress module with clsact functionality so
that commonly used code can be reused, the module is being aliased with
sch_clsact so that it can be auto-loaded properly. Alternative would have been
to add a flag when initializing ingress to alter its behaviour plus aliasing
to a different name (as it's more than just ingress). However, the first would
end up, based on the flag, choosing the new/old behaviour by calling different
function implementations to handle each anyway, the latter would require to
register ingress qdisc once again under different alias. So, this really begs
to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops
by its own that share callbacks used by both.
Example, adding qdisc:
# tc qdisc add dev foo clsact
# tc qdisc show dev foo
qdisc mq 0: root
qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
qdisc clsact ffff: parent ffff:fff1
Adding filters (deleting, etc works analogous by specifying ingress/egress):
# tc filter add dev foo ingress bpf da obj bar.o sec ingress
# tc filter add dev foo egress bpf da obj bar.o sec egress
# tc filter show dev foo ingress
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action
# tc filter show dev foo egress
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action
A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will
show an empty list for clsact. Either using the parent names (ingress/egress)
or specifying the full major/minor will then show the related filter lists.
Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend.
[1] http://patchwork.ozlabs.org/patch/512949/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-01-08 05:29:47 +08:00
|
|
|
/* Fixed minor numbers used by clsact to select between its ingress and
 * egress classifier lists.
 */
#define TC_H_MIN_INGRESS	0xFFF2U
#define TC_H_MIN_EGRESS		0xFFF3U
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-08-15 05:47:11 +08:00
|
|
|
/* Needs to correspond to iproute2 tc/tc_core.h "enum link_layer" */
enum tc_link_layer {
	TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */
	TC_LINKLAYER_ETHERNET,
	TC_LINKLAYER_ATM,
};
#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* Rate specification record.
 * NOTE(review): units of rate/overhead/mpu are not stated in this
 * header; confirm against the users of this struct.
 */
struct tc_ratespec {
	unsigned char	cell_log;
	__u8		linklayer; /* lower 4 bits */
	unsigned short	overhead;
	short		cell_align;
	unsigned short	mpu;
	__u32		rate;
};
|
|
|
|
|
2008-01-24 12:35:19 +08:00
|
|
|
/* NOTE(review): presumably the size of a rate table; whether this is a
 * byte count or an entry count is not visible here -- TODO confirm.
 */
#define TC_RTAB_SIZE 1024
|
|
|
|
|
2008-07-20 15:08:47 +08:00
|
|
|
/* Size specification record.
 * NOTE(review): field semantics are not described in this header;
 * confirm against the users of this struct.
 */
struct tc_sizespec {
	unsigned char	cell_log;
	unsigned char	size_log;
	short		cell_align;
	int		overhead;
	unsigned int	linklayer;
	unsigned int	mpu;
	unsigned int	mtu;
	unsigned int	tsize;
};
|
|
|
|
|
|
|
|
/* Attribute identifiers TCA_STAB_*; __TCA_STAB_MAX is a sentinel that
 * always stays last.
 */
enum {
	TCA_STAB_UNSPEC,
	TCA_STAB_BASE,
	TCA_STAB_DATA,
	__TCA_STAB_MAX
};

/* Highest valid TCA_STAB_* value. */
#define TCA_STAB_MAX (__TCA_STAB_MAX - 1)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* FIFO section */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* FIFO qdisc options. */
struct tc_fifo_qopt {
	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */
};
|
|
|
|
|
2018-07-23 22:07:41 +08:00
|
|
|
/* SKBPRIO section */
|
|
|
|
|
|
|
|
/*
 * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1).
 * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able
 * to map one to one the DS field of IPV4 and IPV6 headers.
 * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY.
 */

#define SKBPRIO_MAX_PRIORITY 64
|
|
|
|
|
|
|
|
/* SKBPRIO qdisc options. */
struct tc_skbprio_qopt {
	__u32	limit;		/* Queue length in packets. */
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* PRIO section */
|
|
|
|
|
|
|
|
/* Band-count constants for the PRIO qdisc.
 * NOTE(review): presumably the upper and lower bounds on tc_prio_qopt.bands;
 * the enforcing code is not visible here.
 */
#define TCQ_PRIO_BANDS	16
#define TCQ_MIN_PRIO_BANDS 2
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
struct tc_prio_qopt {
|
2005-04-17 06:20:36 +08:00
|
|
|
int bands; /* Number of bands */
|
|
|
|
__u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
|
|
|
|
};
|
|
|
|
|
2008-09-13 07:29:34 +08:00
|
|
|
/* MULTIQ section */
|
|
|
|
|
|
|
|
/* MULTIQ qdisc options. */
struct tc_multiq_qopt {
	__u16	bands;			/* Number of bands */
	__u16	max_bands;		/* Maximum number of queues */
};
|
|
|
|
|
2012-02-05 21:51:32 +08:00
|
|
|
/* PLUG section */
|
|
|
|
|
|
|
|
/* Values for tc_plug_qopt.action. */
#define TCQ_PLUG_BUFFER                0
#define TCQ_PLUG_RELEASE_ONE           1
#define TCQ_PLUG_RELEASE_INDEFINITE    2
#define TCQ_PLUG_LIMIT                 3
|
|
|
|
|
|
|
|
/* PLUG qdisc command record. */
struct tc_plug_qopt {
	/* TCQ_PLUG_BUFFER: Insert a plug into the queue and
	 *  buffer any incoming packets
	 * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
	 *   to beginning of the next plug.
	 * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
	 *   Stop buffering packets until the next TCQ_PLUG_BUFFER
	 *   command is received (just act as a pass-thru queue).
	 * TCQ_PLUG_LIMIT: Increase/decrease queue size
	 */
	int             action;
	__u32           limit;
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* TBF section */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
struct tc_tbf_qopt {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct tc_ratespec rate;
|
|
|
|
struct tc_ratespec peakrate;
|
|
|
|
__u32 limit;
|
|
|
|
__u32 buffer;
|
|
|
|
__u32 mtu;
|
|
|
|
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* Attribute identifiers TCA_TBF_*; __TCA_TBF_MAX is a sentinel that
 * always stays last.
 */
enum {
	TCA_TBF_UNSPEC,
	TCA_TBF_PARMS,
	TCA_TBF_RTAB,
	TCA_TBF_PTAB,
	TCA_TBF_RATE64,
	TCA_TBF_PRATE64,
	TCA_TBF_BURST,
	TCA_TBF_PBURST,
	TCA_TBF_PAD,
	__TCA_TBF_MAX,
};

/* Highest valid TCA_TBF_* value. */
#define TCA_TBF_MAX (__TCA_TBF_MAX - 1)
|
|
|
|
|
|
|
|
|
|
|
|
/* TEQL section */
|
|
|
|
|
|
|
|
/* TEQL does not require any parameters */
|
|
|
|
|
|
|
|
/* SFQ section */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* SFQ qdisc options (original layout). */
struct tc_sfq_qopt {
	unsigned	quantum;	/* Bytes per round allocated to flow */
	int		perturb_period;	/* Period of hash perturbation */
	__u32		limit;		/* Maximal packets in queue */
	unsigned	divisor;	/* Hash divisor  */
	unsigned	flows;		/* Maximal number of flows  */
};
|
|
|
|
|
net_sched: sfq: add optional RED on top of SFQ
Adds an optional Random Early Detection on each SFQ flow queue.
Traditional SFQ limits count of packets, while RED permits to also
control number of bytes per flow, and adds ECN capability as well.
1) We dont handle the idle time management in this RED implementation,
since each 'new flow' begins with a null qavg. We really want to address
backlogged flows.
2) if headdrop is selected, we try to ecn mark first packet instead of
currently enqueued packet. This gives faster feedback for tcp flows
compared to traditional RED [ marking the last packet in queue ]
Example of use :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
limit 3000 headdrop flows 512 divisor 16384 \
redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
ewma 6 min 8000b max 60000b probability 0.2 ecn
prob_mark 0 prob_mark_head 4876 prob_drop 6131
forced_mark 0 forced_mark_head 0 forced_drop 0
Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
requeues 0)
rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
flows, we can see number of packets CE marked is smaller than number of
drops (for non ECN flows)
If same test is run, without RED, we can check backlog is much bigger.
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
Tested-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-01-06 14:31:44 +08:00
|
|
|
/* Counters for the optional RED stage on SFQ flow queues. The *_head
 * counters cover the headdrop case, where the first packet of the queue
 * is marked instead of the packet currently being enqueued.
 */
struct tc_sfqred_stats {
	__u32           prob_drop;      /* Early drops, below max threshold */
	__u32           forced_drop;	/* Early drops, after max threshold */
	__u32           prob_mark;      /* Marked packets, below max threshold */
	__u32           forced_mark;    /* Marked packets, after max threshold */
	__u32           prob_mark_head; /* Marked head packets, below max threshold */
	__u32           forced_mark_head;/* Marked head packets, after max threshold */
};
|
|
|
|
|
2012-01-04 22:18:38 +08:00
|
|
|
struct tc_sfq_qopt_v1 {
|
|
|
|
struct tc_sfq_qopt v0;
|
|
|
|
unsigned int depth; /* max number of packets per flow */
|
|
|
|
unsigned int headdrop;
|
net_sched: sfq: add optional RED on top of SFQ
Adds an optional Random Early Detection on each SFQ flow queue.
Traditional SFQ limits count of packets, while RED permits to also
control number of bytes per flow, and adds ECN capability as well.
1) We dont handle the idle time management in this RED implementation,
since each 'new flow' begins with a null qavg. We really want to address
backlogged flows.
2) if headdrop is selected, we try to ecn mark first packet instead of
currently enqueued packet. This gives faster feedback for tcp flows
compared to traditional RED [ marking the last packet in queue ]
Example of use :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
limit 3000 headdrop flows 512 divisor 16384 \
redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
ewma 6 min 8000b max 60000b probability 0.2 ecn
prob_mark 0 prob_mark_head 4876 prob_drop 6131
forced_mark 0 forced_mark_head 0 forced_drop 0
Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
requeues 0)
rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
flows, we can see number of packets CE marked is smaller than number of
drops (for non ECN flows)
If same test is run, without RED, we can check backlog is much bigger.
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
Tested-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-01-06 14:31:44 +08:00
|
|
|
/* SFQRED parameters */
|
|
|
|
__u32 limit; /* HARD maximal flow queue length (bytes) */
|
|
|
|
__u32 qth_min; /* Min average length threshold (bytes) */
|
|
|
|
__u32 qth_max; /* Max average length threshold (bytes) */
|
|
|
|
unsigned char Wlog; /* log(W) */
|
|
|
|
unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
|
|
|
|
unsigned char Scell_log; /* cell size for idle damping */
|
|
|
|
unsigned char flags;
|
|
|
|
__u32 max_P; /* probability, high resolution */
|
|
|
|
/* SFQRED stats */
|
|
|
|
struct tc_sfqred_stats stats;
|
2012-01-04 22:18:38 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* SFQ extended statistics: a single signed 32-bit value.
 * NOTE(review): the meaning of allot is not described in this header.
 */
struct tc_sfq_xstats {
	__s32		allot;
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* RED section */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* Attribute identifiers TCA_RED_*; __TCA_RED_MAX is a sentinel that
 * always stays last.
 */
enum {
	TCA_RED_UNSPEC,
	TCA_RED_PARMS,
	TCA_RED_STAB,
	TCA_RED_MAX_P,		/* current max_p value, reported on dump */
	TCA_RED_FLAGS,		/* bitfield32 */
	TCA_RED_EARLY_DROP_BLOCK, /* u32 */
	TCA_RED_MARK_BLOCK,	/* u32 */
	__TCA_RED_MAX,
};

/* Highest valid TCA_RED_* value. */
#define TCA_RED_MAX (__TCA_RED_MAX - 1)
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* RED qdisc options, together with the flag bits that may be stored in
 * the flags byte.
 */
struct tc_red_qopt {
	__u32		limit;		/* HARD maximal queue length (bytes)	*/
	__u32		qth_min;	/* Min average length threshold (bytes) */
	__u32		qth_max;	/* Max average length threshold (bytes) */
	unsigned char   Wlog;		/* log(W)		*/
	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/
	unsigned char   Scell_log;	/* cell size for idle damping */

	/* This field can be used for flags that a RED-like qdisc has
	 * historically supported. E.g. when configuring RED, it can be used for
	 * ECN, HARDDROP and ADAPTATIVE. For SFQ it can be used for ECN,
	 * HARDDROP. Etc. Because this field has not been validated, and is
	 * copied back on dump, any bits besides those to which a given qdisc
	 * has assigned a historical meaning need to be considered for free use
	 * by userspace tools.
	 *
	 * Any further flags need to be passed differently, e.g. through an
	 * attribute (such as TCA_RED_FLAGS above). Such attribute should allow
	 * passing both recent and historic flags in one value.
	 */
	unsigned char	flags;
#define TC_RED_ECN		1
#define TC_RED_HARDDROP		2
#define TC_RED_ADAPTATIVE	4
#define TC_RED_NODROP		8
};
|
|
|
|
|
net: sched: Allow extending set of supported RED flags
The qdiscs RED, GRED, SFQ and CHOKE use different subsets of the same pool
of global RED flags. These are passed in tc_red_qopt.flags. However none of
these qdiscs validate the flag field, and just copy it over wholesale to
internal structures, and later dump it back. (An exception is GRED, which
does validate for VQs -- however not for the main setup.)
A broken userspace can therefore configure a qdisc with arbitrary
unsupported flags, and later expect to see the flags on qdisc dump. The
current ABI therefore allows storage of several bits of custom data to
qdisc instances of the types mentioned above. How many bits, depends on
which flags are meaningful for the qdisc in question. E.g. SFQ recognizes
flags ECN and HARDDROP, and the rest is not interpreted.
If SFQ ever needs to support ADAPTATIVE, it needs another way of doing it,
and at the same time it needs to retain the possibility to store 6 bits of
uninterpreted data. Likewise RED, which adds a new flag later in this
patchset.
To that end, this patch adds a new function, red_get_flags(), to split the
passed flags of RED-like qdiscs to flags and user bits, and
red_validate_flags() to validate the resulting configuration. It further
adds a new attribute, TCA_RED_FLAGS, to pass arbitrary flags.
Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-13 07:10:56 +08:00
|
|
|
/* The three original RED flag bits (ECN, HARDDROP, ADAPTATIVE) combined;
 * TC_RED_NODROP is not part of this set.
 */
#define TC_RED_HISTORIC_FLAGS (TC_RED_ECN | TC_RED_HARDDROP | TC_RED_ADAPTATIVE)
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* RED extended statistics: drop and mark counters. */
struct tc_red_xstats {
	__u32		early;		/* Early drops */
	__u32		pdrop;		/* Drops due to queue limits */
	__u32		other;		/* Drops due to drop() calls */
	__u32		marked;		/* Marked packets */
};
|
|
|
|
|
|
|
|
/* GRED section */

/* NOTE(review): presumably the upper bound on the number of GRED DPs
 * (compare tc_gred_sopt.DPs) - confirm against the GRED qdisc.
 */
#define MAX_DPs 16
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* GRED attribute identifiers. __TCA_GRED_MAX is a sentinel that must stay
 * last; TCA_GRED_MAX is the highest real identifier.
 */
enum {
	TCA_GRED_UNSPEC,
	TCA_GRED_PARMS,
	TCA_GRED_STAB,
	TCA_GRED_DPS,
	TCA_GRED_MAX_P,
	TCA_GRED_LIMIT,
	TCA_GRED_VQ_LIST,	/* nested TCA_GRED_VQ_ENTRY */
	__TCA_GRED_MAX,
};

#define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
|
|
|
|
|
2018-11-15 14:23:49 +08:00
|
|
|
/* Identifiers used inside a TCA_GRED_VQ_LIST nest. The sentinel stays last;
 * TCA_GRED_VQ_ENTRY_MAX is the highest real identifier.
 */
enum {
	TCA_GRED_VQ_ENTRY_UNSPEC,
	TCA_GRED_VQ_ENTRY,	/* nested TCA_GRED_VQ_* */
	__TCA_GRED_VQ_ENTRY_MAX,
};
#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1)
|
|
|
|
|
|
|
|
/* Identifiers used inside a TCA_GRED_VQ_ENTRY nest; the trailing comment on
 * each one gives its payload type. The sentinel stays last and
 * TCA_GRED_VQ_MAX is the highest real identifier.
 */
enum {
	TCA_GRED_VQ_UNSPEC,
	TCA_GRED_VQ_PAD,
	TCA_GRED_VQ_DP,			/* u32 */
	TCA_GRED_VQ_STAT_BYTES,		/* u64 */
	TCA_GRED_VQ_STAT_PACKETS,	/* u32 */
	TCA_GRED_VQ_STAT_BACKLOG,	/* u32 */
	TCA_GRED_VQ_STAT_PROB_DROP,	/* u32 */
	TCA_GRED_VQ_STAT_PROB_MARK,	/* u32 */
	TCA_GRED_VQ_STAT_FORCED_DROP,	/* u32 */
	TCA_GRED_VQ_STAT_FORCED_MARK,	/* u32 */
	TCA_GRED_VQ_STAT_PDROP,		/* u32 */
	TCA_GRED_VQ_STAT_OTHER,		/* u32 */
	TCA_GRED_VQ_FLAGS,		/* u32 */
	__TCA_GRED_VQ_MAX
};

#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1)
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* GRED per-VQ options: thresholds and RED parameters, followed by counters. */
struct tc_gred_qopt {
	__u32		limit;		/* HARD maximal queue length (bytes) */
	__u32		qth_min;	/* Min average length threshold (bytes) */
	__u32		qth_max;	/* Max average length threshold (bytes) */
	__u32		DP;		/* up to 2^32 DPs */
	__u32		backlog;
	__u32		qave;
	__u32		forced;
	__u32		early;
	__u32		other;
	__u32		pdrop;
	__u8		Wlog;		/* log(W) */
	__u8		Plog;		/* log(P_max/(qth_max-qth_min)) */
	__u8		Scell_log;	/* cell size for idle damping */
	__u8		prio;		/* prio of this VQ */
	__u32		packets;
	__u32		bytesin;
};
|
2005-11-06 04:14:25 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* gred setup */
struct tc_gred_sopt {
	__u32		DPs;
	__u32		def_DP;
	__u8		grio;
	__u8		flags;
	__u16		pad1;	/* explicit padding up to a 4-byte multiple */
};
|
|
|
|
|
2011-02-02 23:21:10 +08:00
|
|
|
/* CHOKe section */
|
|
|
|
|
|
|
|
/* CHOKe attribute identifiers. The sentinel stays last; TCA_CHOKE_MAX is the
 * highest real identifier.
 */
enum {
	TCA_CHOKE_UNSPEC,
	TCA_CHOKE_PARMS,
	TCA_CHOKE_STAB,
	TCA_CHOKE_MAX_P,
	__TCA_CHOKE_MAX,
};

#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1)
|
|
|
|
|
|
|
|
/* CHOKe options; limits and thresholds are expressed in packets. */
struct tc_choke_qopt {
	__u32		limit;		/* Hard queue length (packets) */
	__u32		qth_min;	/* Min average threshold (packets) */
	__u32		qth_max;	/* Max average threshold (packets) */
	unsigned char	Wlog;		/* log(W) */
	unsigned char	Plog;		/* log(P_max/(qth_max-qth_min)) */
	unsigned char	Scell_log;	/* cell size for idle damping */
	unsigned char	flags;		/* see RED flags */
};
|
|
|
|
|
|
|
|
/* CHOKe extended statistics: drop and mark counters. */
struct tc_choke_xstats {
	__u32		early;		/* Early drops */
	__u32		pdrop;		/* Drops due to queue limits */
	__u32		other;		/* Drops due to drop() calls */
	__u32		marked;		/* Marked packets */
	__u32		matched;	/* Drops due to flow match */
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* HTB section */
#define TC_HTB_NUMPRIO		8
#define TC_HTB_MAXDEPTH		8
#define TC_HTB_PROTOVER		3 /* the same as HTB and TC's major */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
struct tc_htb_opt {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct tc_ratespec rate;
|
|
|
|
struct tc_ratespec ceil;
|
|
|
|
__u32 buffer;
|
|
|
|
__u32 cbuffer;
|
|
|
|
__u32 quantum;
|
|
|
|
__u32 level; /* out only */
|
|
|
|
__u32 prio;
|
|
|
|
};
|
2009-11-05 01:50:58 +08:00
|
|
|
/* HTB qdisc-wide settings followed by one statistics counter. */
struct tc_htb_glob {
	__u32 version;		/* to match HTB/TC */
	__u32 rate2quantum;	/* bps->quantum divisor */
	__u32 defcls;		/* default class number */
	__u32 debug;		/* debug flags */

	/* stats */
	__u32 direct_pkts;	/* count of non shaped packets */
};
|
2009-11-05 01:50:58 +08:00
|
|
|
/* HTB attribute identifiers. The sentinel stays last; TCA_HTB_MAX is the
 * highest real identifier.
 */
enum {
	TCA_HTB_UNSPEC,
	TCA_HTB_PARMS,
	TCA_HTB_INIT,
	TCA_HTB_CTAB,
	TCA_HTB_RTAB,
	TCA_HTB_DIRECT_QLEN,
	TCA_HTB_RATE64,
	TCA_HTB_CEIL64,
	TCA_HTB_PAD,
	__TCA_HTB_MAX,
};

#define TCA_HTB_MAX (__TCA_HTB_MAX - 1)
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* HTB extended statistics. */
struct tc_htb_xstats {
	__u32 lends;
	__u32 borrows;
	__u32 giants;	/* unused since 'Make HTB scheduler work with TSO.' */
	__s32 tokens;
	__s32 ctokens;
};
|
|
|
|
|
|
|
|
/* HFSC section */

/* HFSC qdisc options. */
struct tc_hfsc_qopt {
	__u16	defcls;		/* default class */
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* Two-segment service curve: slope m1 for the first d microseconds of
 * x-projection, then slope m2.
 */
struct tc_service_curve {
	__u32	m1;		/* slope of the first segment in bps */
	__u32	d;		/* x-projection of the first segment in us */
	__u32	m2;		/* slope of the second segment in bps */
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* HFSC statistics. */
struct tc_hfsc_stats {
	__u64	work;		/* total work done */
	__u64	rtwork;		/* work done by real-time criteria */
	__u32	period;		/* current period */
	__u32	level;		/* class level in hierarchy */
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* HFSC attribute identifiers. The sentinel stays last; TCA_HFSC_MAX is the
 * highest real identifier.
 */
enum {
	TCA_HFSC_UNSPEC,
	TCA_HFSC_RSC,
	TCA_HFSC_FSC,
	TCA_HFSC_USC,
	__TCA_HFSC_MAX,
};

#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1)
|
|
|
|
|
|
|
|
|
|
|
|
/* CBQ section */

#define TC_CBQ_MAXPRIO		8
#define TC_CBQ_MAXLEVEL		8
#define TC_CBQ_DEF_EWMA		5
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* CBQ link-sharing options, with the bit constants that belong to them. */
struct tc_cbq_lssopt {
	unsigned char	change;
	unsigned char	flags;
/* Bits for flags. */
#define TCF_CBQ_LSS_BOUNDED	1
#define TCF_CBQ_LSS_ISOLATED	2
	unsigned char	ewma_log;
	unsigned char	level;
/* NOTE(review): presumably bits for 'change', one per settable field
 * (flags, ewma_log, maxidle, minidle, offtime, avpkt) - confirm against
 * the CBQ qdisc.
 */
#define TCF_CBQ_LSS_FLAGS	1
#define TCF_CBQ_LSS_EWMA	2
#define TCF_CBQ_LSS_MAXIDLE	4
#define TCF_CBQ_LSS_MINIDLE	8
#define TCF_CBQ_LSS_OFFTIME	0x10
#define TCF_CBQ_LSS_AVPKT	0x20
	__u32		maxidle;
	__u32		minidle;
	__u32		offtime;
	__u32		avpkt;
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* CBQ weighted-round-robin options. */
struct tc_cbq_wrropt {
	unsigned char	flags;
	unsigned char	priority;
	unsigned char	cpriority;
	unsigned char	__reserved;
	__u32		allot;
	__u32		weight;
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* CBQ overlimit options. */
struct tc_cbq_ovl {
	unsigned char	strategy;
/* NOTE(review): presumably the values accepted in 'strategy' - confirm. */
#define	TC_CBQ_OVL_CLASSIC	0
#define	TC_CBQ_OVL_DELAY	1
#define	TC_CBQ_OVL_LOWPRIO	2
#define	TC_CBQ_OVL_DROP		3
#define	TC_CBQ_OVL_RCLASSIC	4
	unsigned char	priority2;
	__u16		pad;
	__u32		penalty;
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* CBQ policing option; the __res* members are reserved filler. */
struct tc_cbq_police {
	unsigned char	police;
	unsigned char	__res1;
	unsigned short	__res2;
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* CBQ filter options. */
struct tc_cbq_fopt {
	__u32		split;
	__u32		defmap;
	__u32		defchange;
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* CBQ extended statistics; avgidle and undertime are signed. */
struct tc_cbq_xstats {
	__u32		borrows;
	__u32		overactions;
	__s32		avgidle;
	__s32		undertime;
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* CBQ attribute identifiers. The sentinel stays last; TCA_CBQ_MAX is the
 * highest real identifier.
 */
enum {
	TCA_CBQ_UNSPEC,
	TCA_CBQ_LSSOPT,
	TCA_CBQ_WRROPT,
	TCA_CBQ_FOPT,
	TCA_CBQ_OVL_STRATEGY,
	TCA_CBQ_RATE,
	TCA_CBQ_RTAB,
	TCA_CBQ_POLICE,
	__TCA_CBQ_MAX,
};

#define TCA_CBQ_MAX	(__TCA_CBQ_MAX - 1)
|
|
|
|
|
|
|
|
/* dsmark section */
|
|
|
|
|
|
|
|
/* dsmark attribute identifiers. The sentinel stays last; TCA_DSMARK_MAX is
 * the highest real identifier.
 */
enum {
	TCA_DSMARK_UNSPEC,
	TCA_DSMARK_INDICES,
	TCA_DSMARK_DEFAULT_INDEX,
	TCA_DSMARK_SET_TC_INDEX,
	TCA_DSMARK_MASK,
	TCA_DSMARK_VALUE,
	__TCA_DSMARK_MAX,
};

#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1)
|
|
|
|
|
|
|
|
/* ATM section */
|
|
|
|
|
|
|
|
/* ATM attribute identifiers. The sentinel stays last; TCA_ATM_MAX is the
 * highest real identifier.
 */
enum {
	TCA_ATM_UNSPEC,
	TCA_ATM_FD,		/* file/socket descriptor */
	TCA_ATM_PTR,		/* pointer to descriptor - later */
	TCA_ATM_HDR,		/* LL header */
	TCA_ATM_EXCESS,		/* excess traffic class (0 for CLP)  */
	TCA_ATM_ADDR,		/* PVC address (for output only) */
	TCA_ATM_STATE,		/* VC state (ATM_VS_*; for output only) */
	__TCA_ATM_MAX,
};

#define TCA_ATM_MAX	(__TCA_ATM_MAX - 1)
|
|
|
|
|
|
|
|
/* Network emulator */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* netem attribute identifiers. The sentinel stays last; TCA_NETEM_MAX is the
 * highest real identifier.
 */
enum {
	TCA_NETEM_UNSPEC,
	TCA_NETEM_CORR,
	TCA_NETEM_DELAY_DIST,
	TCA_NETEM_REORDER,
	TCA_NETEM_CORRUPT,
	TCA_NETEM_LOSS,
	TCA_NETEM_RATE,
	TCA_NETEM_ECN,
	TCA_NETEM_RATE64,
	TCA_NETEM_PAD,
	TCA_NETEM_LATENCY64,
	TCA_NETEM_JITTER64,
	/* Slotting: deliver packets in delayed time slots, a crude
	 * approximation of shared media such as cable, wifi and LTE.
	 */
	TCA_NETEM_SLOT,
	TCA_NETEM_SLOT_DIST,
	__TCA_NETEM_MAX,
};

#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1)
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* Basic netem options. For loss and duplicate, 0 means none and ~0 means
 * 100%.
 */
struct tc_netem_qopt {
	__u32	latency;	/* added delay (us) */
	__u32   limit;		/* fifo limit (packets) */
	__u32	loss;		/* random packet loss (0=none ~0=100%) */
	__u32	gap;		/* re-ordering gap (0 for none) */
	__u32   duplicate;	/* random packet dup  (0=none ~0=100%) */
	__u32	jitter;		/* random jitter in latency (us) */
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* netem correlation settings for delay, loss and duplication. */
struct tc_netem_corr {
	__u32	delay_corr;	/* delay correlation */
	__u32	loss_corr;	/* packet loss correlation */
	__u32	dup_corr;	/* duplicate correlation  */
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* netem reordering: probability and its correlation. */
struct tc_netem_reorder {
	__u32	probability;
	__u32	correlation;
};
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* netem corruption: probability and its correlation. */
struct tc_netem_corrupt {
	__u32	probability;
	__u32	correlation;
};
|
|
|
|
|
2011-11-30 20:20:26 +08:00
|
|
|
/* netem rate settings; the two overhead members are signed. */
struct tc_netem_rate {
	__u32	rate;	/* byte/s */
	__s32	packet_overhead;
	__u32	cell_size;
	__s32	cell_overhead;
};
|
|
|
|
|
netem: support delivering packets in delayed time slots
Slotting is a crude approximation of the behaviors of shared media such
as cable, wifi, and LTE, which gather up a bunch of packets within a
varying delay window and deliver them, relative to that, nearly all at
once.
It works within the existing loss, duplication, jitter and delay
parameters of netem. Some amount of inherent latency must be specified,
regardless.
The new "slot" parameter specifies a minimum and maximum delay between
transmission attempts.
The "bytes" and "packets" parameters can be used to limit the amount of
information transferred per slot.
Examples of use:
tc qdisc add dev eth0 root netem delay 200us \
slot 800us 10ms bytes 64k packets 42
A more correct example, using stacked netem instances and a packet limit
to emulate a tail drop wifi queue with slots and variable packet
delivery, with a 200Mbit isochronous underlying rate, and 20ms path
delay:
tc qdisc add dev eth0 root handle 1: netem delay 20ms rate 200mbit \
limit 10000
tc qdisc add dev eth0 parent 1:1 handle 10:1 netem delay 200us \
slot 800us 10ms bytes 64k packets 42 limit 512
Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-09 07:12:28 +08:00
|
|
|
/* netem slot configuration: a minimum and maximum delay between transmission
 * attempts, plus optional limits on the packets and bytes transferred per
 * slot.
 */
struct tc_netem_slot {
	__s64   min_delay; /* nsec */
	__s64   max_delay;
	__s32   max_packets;
	__s32   max_bytes;
	__s64	dist_delay; /* nsec */
	__s64	dist_jitter; /* nsec */
};
|
|
|
|
|
2011-02-23 21:04:21 +08:00
|
|
|
/* netem loss model identifiers. The sentinel stays last; NETEM_LOSS_MAX is
 * the highest real identifier.
 */
enum {
	NETEM_LOSS_UNSPEC,
	NETEM_LOSS_GI,		/* General Intuitive - 4 state model */
	NETEM_LOSS_GE,		/* Gilbert Elliot models */
	__NETEM_LOSS_MAX
};
#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1)
|
|
|
|
|
2011-11-21 14:53:46 +08:00
|
|
|
/* State transition probabilities for 4 state model */
struct tc_netem_gimodel {
	__u32	p13;
	__u32	p31;
	__u32	p32;
	__u32	p14;
	__u32	p23;
};
|
|
|
|
|
|
|
|
/* Gilbert-Elliot models */
struct tc_netem_gemodel {
	__u32 p;
	__u32 r;
	__u32 h;
	__u32 k1;
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Constants for netem distribution tables.
 * NOTE(review): presumably the fixed-point scale of table entries and the
 * maximum number of entries - confirm against the netem qdisc.
 */
#define NETEM_DIST_SCALE	8192
#define NETEM_DIST_MAX		16384
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-11-20 20:10:00 +08:00
|
|
|
/* DRR */
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* DRR attribute identifiers. The sentinel stays last; TCA_DRR_MAX is the
 * highest real identifier.
 */
enum {
	TCA_DRR_UNSPEC,
	TCA_DRR_QUANTUM,
	__TCA_DRR_MAX
};

#define TCA_DRR_MAX	(__TCA_DRR_MAX - 1)
|
|
|
|
|
2009-11-05 01:50:58 +08:00
|
|
|
/* DRR statistics. */
struct tc_drr_stats {
	__u32	deficit;
};
|
|
|
|
|
2011-01-17 16:06:09 +08:00
|
|
|
/* MQPRIO */
/* Array-sizing constants for struct tc_mqprio_qopt: prio_tc_map has
 * TC_QOPT_BITMASK + 1 entries, count and offset have TC_QOPT_MAX_QUEUE.
 */
#define TC_QOPT_BITMASK 15
#define TC_QOPT_MAX_QUEUE 16
|
|
|
|
|
2017-03-16 01:39:18 +08:00
|
|
|
/* MQPRIO hardware offload levels. The sentinel stays last;
 * TC_MQPRIO_HW_OFFLOAD_MAX is the highest real level.
 */
enum {
	TC_MQPRIO_HW_OFFLOAD_NONE,	/* no offload requested */
	TC_MQPRIO_HW_OFFLOAD_TCS,	/* offload TCs, no queue counts */
	__TC_MQPRIO_HW_OFFLOAD_MAX
};

#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
|
|
|
|
|
2017-09-07 19:00:06 +08:00
|
|
|
/* MQPRIO modes (presumably the values of the TCA_MQPRIO_MODE attribute). */
enum {
	TC_MQPRIO_MODE_DCB,
	TC_MQPRIO_MODE_CHANNEL,
	__TC_MQPRIO_MODE_MAX,
	/* Highest valid mode. It is computed inside the enum, where
	 * __TC_MQPRIO_MODE_MAX still names the sentinel enumerator rather
	 * than the legacy macro defined below.
	 */
	TC_MQPRIO_MODE_MAX = __TC_MQPRIO_MODE_MAX - 1
};

/*
 * Legacy, misnamed macro: following the pattern of TC_MQPRIO_HW_OFFLOAD_MAX
 * it was apparently meant to be called TC_MQPRIO_MODE_MAX, but it shadows the
 * sentinel enumerator instead. A self-referential macro is not re-expanded,
 * so it evaluates to (sentinel - 1), i.e. the highest valid mode. It is kept
 * unchanged so existing users keep seeing the same value; new code should
 * use TC_MQPRIO_MODE_MAX.
 */
#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)
|
|
|
|
|
|
|
|
/* MQPRIO shapers (presumably the values of the TCA_MQPRIO_SHAPER attribute). */
enum {
	TC_MQPRIO_SHAPER_DCB,
	TC_MQPRIO_SHAPER_BW_RATE,	/* Add new shapers below */
	__TC_MQPRIO_SHAPER_MAX,
	/* Highest valid shaper. It is computed inside the enum, where
	 * __TC_MQPRIO_SHAPER_MAX still names the sentinel enumerator rather
	 * than the legacy macro defined below. New shapers go above the
	 * sentinel.
	 */
	TC_MQPRIO_SHAPER_MAX = __TC_MQPRIO_SHAPER_MAX - 1
};

/*
 * Legacy, misnamed macro: it was apparently meant to be called
 * TC_MQPRIO_SHAPER_MAX, but it shadows the sentinel enumerator instead. A
 * self-referential macro is not re-expanded, so it evaluates to
 * (sentinel - 1), i.e. the highest valid shaper. It is kept unchanged so
 * existing users keep seeing the same value; new code should use
 * TC_MQPRIO_SHAPER_MAX.
 */
#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
|
|
|
|
|
2011-01-17 16:06:09 +08:00
|
|
|
struct tc_mqprio_qopt {
|
|
|
|
__u8 num_tc;
|
|
|
|
__u8 prio_tc_map[TC_QOPT_BITMASK + 1];
|
|
|
|
__u8 hw;
|
|
|
|
__u16 count[TC_QOPT_MAX_QUEUE];
|
|
|
|
__u16 offset[TC_QOPT_MAX_QUEUE];
|
|
|
|
};
|
|
|
|
|
2017-09-07 19:00:06 +08:00
|
|
|
/* MQPRIO flag bits, one per optional setting (mode, shaper, min rate,
 * max rate).
 */
#define TC_MQPRIO_F_MODE		0x1
#define TC_MQPRIO_F_SHAPER		0x2
#define TC_MQPRIO_F_MIN_RATE		0x4
#define TC_MQPRIO_F_MAX_RATE		0x8
|
|
|
|
|
|
|
|
/* MQPRIO attribute identifiers. The sentinel stays last; TCA_MQPRIO_MAX is
 * the highest real identifier.
 */
enum {
	TCA_MQPRIO_UNSPEC,
	TCA_MQPRIO_MODE,
	TCA_MQPRIO_SHAPER,
	TCA_MQPRIO_MIN_RATE64,
	TCA_MQPRIO_MAX_RATE64,
	__TCA_MQPRIO_MAX,
};

#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
|
|
|
|
|
net_sched: SFB flow scheduler
This is the Stochastic Fair Blue scheduler, based on work from :
W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: A New Class of Active Queue
Management Algorithms. U. Michigan CSE-TR-387-99, April 1999.
http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
This implementation is based on work done by Juliusz Chroboczek
General SFB algorithm can be found in figure 14, page 15:
B[l][n] : L x N array of bins (L levels, N bins per level)
enqueue()
Calculate hash function values h{0}, h{1}, .. h{L-1}
Update bins at each level
for i = 0 to L - 1
if (B[i][h{i}].qlen > bin_size)
B[i][h{i}].p_mark += p_increment;
else if (B[i][h{i}].qlen == 0)
B[i][h{i}].p_mark -= p_decrement;
p_min = min(B[0][h{0}].p_mark ... B[L-1][h{L-1}].p_mark);
if (p_min == 1.0)
ratelimit();
else
mark/drop with probability p_min;
I did the adaptation of Juliusz code to meet current kernel standards,
and various changes to address previous comments :
http://thread.gmane.org/gmane.linux.network/90225
http://thread.gmane.org/gmane.linux.network/90375
Default flow classifier is the rxhash introduced by RPS in 2.6.35, but
we can use an external flow classifier if wanted.
tc qdisc add dev $DEV parent 1:11 handle 11: \
est 0.5sec 2sec sfb limit 128
tc filter add dev $DEV protocol ip parent 11: handle 3 \
flow hash keys dst divisor 1024
Notes:
1) SFB default child qdisc is pfifo_fast. It can be changed by another
qdisc but a child qdisc MUST not drop a packet previously queued. This
is because SFB needs to handle a dequeued packet in order to maintain
its virtual queue states. pfifo_head_drop or CHOKe should not be used.
2) ECN is enabled by default, unlike RED/CHOKe/GRED
With help from Patrick McHardy & Andi Kleen
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Juliusz Chroboczek <Juliusz.Chroboczek@pps.jussieu.fr>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Andi Kleen <andi@firstfloor.org>
CC: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-02-23 18:56:17 +08:00
|
|
|
/* SFB */
|
|
|
|
|
|
|
|
/* SFB attribute identifiers. The sentinel stays last; TCA_SFB_MAX is the
 * highest real identifier.
 */
enum {
	TCA_SFB_UNSPEC,
	TCA_SFB_PARMS,
	__TCA_SFB_MAX,
};

#define TCA_SFB_MAX (__TCA_SFB_MAX - 1)
|
|
|
|
|
|
|
|
/*
 * Note: increment, decrement are Q0.16 fixed-point values.
 */
struct tc_sfb_qopt {
	__u32 rehash_interval;	/* delay between hash move, in ms */
	__u32 warmup_time;	/* double buffering warmup time in ms (warmup_time < rehash_interval) */
	__u32 max;		/* max len of qlen_min */
	__u32 bin_size;		/* maximum queue length per bin */
	__u32 increment;	/* probability increment, (d1 in Blue) */
	__u32 decrement;	/* probability decrement, (d2 in Blue) */
	__u32 limit;		/* max SFB queue length */
	__u32 penalty_rate;	/* inelastic flows are rate limited to 'rate' pps */
	__u32 penalty_burst;
};
|
|
|
|
|
|
|
|
/* SFB extended statistics: drop counters by cause, then mark, queue-length
 * and probability figures.
 */
struct tc_sfb_xstats {
	__u32 earlydrop;
	__u32 penaltydrop;
	__u32 bucketdrop;
	__u32 queuedrop;
	__u32 childdrop; /* drops in child qdisc */
	__u32 marked;
	__u32 maxqlen;
	__u32 maxprob;
	__u32 avgprob;
};
|
|
|
|
|
|
|
|
/* Largest 16-bit value. NOTE(review): presumably the upper bound of SFB
 * probabilities (increment and decrement are Q0.16 values) - confirm.
 */
#define SFB_MAX_PROB 0xFFFF
|
|
|
|
|
2011-04-04 13:30:58 +08:00
|
|
|
/* QFQ */
|
|
|
|
/* QFQ attribute identifiers. The sentinel stays last; TCA_QFQ_MAX is the
 * highest real identifier.
 */
enum {
	TCA_QFQ_UNSPEC,
	TCA_QFQ_WEIGHT,
	TCA_QFQ_LMAX,
	__TCA_QFQ_MAX
};

#define TCA_QFQ_MAX	(__TCA_QFQ_MAX - 1)
|
|
|
|
|
|
|
|
/* QFQ statistics: weight and lmax (compare TCA_QFQ_WEIGHT, TCA_QFQ_LMAX). */
struct tc_qfq_stats {
	__u32 weight;
	__u32 lmax;
};
|
|
|
|
|
codel: Controlled Delay AQM
An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson.
http://queue.acm.org/detail.cfm?id=2209336
This AQM main input is no longer queue size in bytes or packets, but the
delay packets stay in (FIFO) queue.
As we don't have infinite memory, we still can drop packets in enqueue()
in case of massive load, but mean of CoDel is to drop packets in
dequeue(), using a control law based on two simple parameters :
target : target sojourn time (default 5ms)
interval : width of moving time window (default 100ms)
Based on initial work from Dave Taht.
Refactored to help future codel inclusion as a plugin for other linux
qdisc (FQ_CODEL, ...), like RED.
include/net/codel.h contains codel algorithm as close as possible to
Kathleen's reference.
net/sched/sch_codel.c contains the linux qdisc specific glue.
Separate structures permit a memory efficient implementation of fq_codel
(to be sent as a separate work) : Each flow has its own struct
codel_vars.
timestamps are taken at enqueue() time with 1024 ns precision, allowing
a range of 2199 seconds in queue, and 100Gb links support. iproute2 uses
usec as base unit.
Selected packets are dropped, unless ECN is enabled and packets can get
ECN mark instead.
Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and
tg3 drivers (BQL enabled).
Usage: tc qdisc ... codel [ limit PACKETS ] [ target TIME ]
[ interval TIME ] [ ecn ]
qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn
Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0)
rate 202365Kbit 16708pps backlog 113550b 75p requeues 0
count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us
maxpacket 1514 ecn_mark 84399 drop_overlimit 0
CoDel must be seen as a base module, and should be used keeping in mind
there is still a FIFO queue. So a typical setup will probably need a
hierarchy of several qdiscs and packet classifiers to be able to meet
whatever constraints a user might have.
One possible example would be to use fq_codel, which combines Fair
Queueing and CoDel, in replacement of sfq / sfq_red.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Dave Taht <dave.taht@bufferbloat.net>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Van Jacobson <van@pollere.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 15:51:25 +08:00
|
|
|
/* CODEL */
|
|
|
|
|
|
|
|
enum {
|
|
|
|
TCA_CODEL_UNSPEC,
|
|
|
|
TCA_CODEL_TARGET,
|
|
|
|
TCA_CODEL_LIMIT,
|
|
|
|
TCA_CODEL_INTERVAL,
|
|
|
|
TCA_CODEL_ECN,
|
codel: add ce_threshold attribute
For DCTCP or similar ECN based deployments on fabrics with shallow
buffers, hosts are responsible for a good part of the buffering.
This patch adds an optional ce_threshold to codel & fq_codel qdiscs,
so that DCTCP can have feedback from queuing in the host.
A DCTCP enabled egress port simply have a queue occupancy threshold
above which ECT packets get CE mark.
In codel language this translates to a sojourn time, so that one doesn't
have to worry about bytes or bandwidth but delays.
This makes the host an active participant in the health of the whole
network.
This also helps experimenting DCTCP in a setup without DCTCP compliant
fabric.
On following example, ce_threshold is set to 1ms, and we can see from
'ldelay xxx us' that TCP is not trying to go around the 5ms codel
target.
Queue has more capacity to absorb inelastic bursts (say from UDP
traffic), as queues are maintained to an optimal level.
lpaa23:~# ./tc -s -d qd sh dev eth1
qdisc mq 1: dev eth1 root
Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961)
backlog 3108242b 364p requeues 42961
qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503)
rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503
count 0 lastcount 0 ldelay 1.0ms drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384
qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186)
rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186
count 0 lastcount 0 ldelay 694us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873
qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554)
rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554
count 0 lastcount 0 ldelay 889us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780
...
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-09 06:05:12 +08:00
|
|
|
TCA_CODEL_CE_THRESHOLD,
|
codel: Controlled Delay AQM
An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson.
http://queue.acm.org/detail.cfm?id=2209336
This AQM main input is no longer queue size in bytes or packets, but the
delay packets stay in (FIFO) queue.
As we don't have infinite memory, we still can drop packets in enqueue()
in case of massive load, but mean of CoDel is to drop packets in
dequeue(), using a control law based on two simple parameters :
target : target sojourn time (default 5ms)
interval : width of moving time window (default 100ms)
Based on initial work from Dave Taht.
Refactored to help future codel inclusion as a plugin for other linux
qdisc (FQ_CODEL, ...), like RED.
include/net/codel.h contains codel algorithm as close as possible than
Kathleen reference.
net/sched/sch_codel.c contains the linux qdisc specific glue.
Separate structures permit a memory efficient implementation of fq_codel
(to be sent as a separate work) : Each flow has its own struct
codel_vars.
timestamps are taken at enqueue() time with 1024 ns precision, allowing
a range of 2199 seconds in queue, and 100Gb links support. iproute2 uses
usec as base unit.
Selected packets are dropped, unless ECN is enabled and packets can get
ECN mark instead.
Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and
tg3 drivers (BQL enabled).
Usage: tc qdisc ... codel [ limit PACKETS ] [ target TIME ]
[ interval TIME ] [ ecn ]
qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn
Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0)
rate 202365Kbit 16708pps backlog 113550b 75p requeues 0
count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us
maxpacket 1514 ecn_mark 84399 drop_overlimit 0
CoDel must be seen as a base module, and should be used keeping in mind
there is still a FIFO queue. So a typical setup will probably need a
hierarchy of several qdiscs and packet classifiers to be able to meet
whatever constraints a user might have.
One possible example would be to use fq_codel, which combines Fair
Queueing and CoDel, in replacement of sfq / sfq_red.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Dave Taht <dave.taht@bufferbloat.net>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Van Jacobson <van@pollere.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 15:51:25 +08:00
|
|
|
__TCA_CODEL_MAX
|
|
|
|
};
|
|
|
|
|
|
|
|
/* One less than __TCA_CODEL_MAX, the sentinel that closes the enum above.
 * NOTE(review): presumably the highest valid CoDel attribute number; the
 * other enumerators are not visible from here - confirm.
 */
#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1)
|
|
|
|
|
|
|
|
struct tc_codel_xstats {
|
|
|
|
__u32 maxpacket; /* largest packet we've seen so far */
|
|
|
|
__u32 count; /* how many drops we've done since the last time we
|
|
|
|
* entered dropping state
|
|
|
|
*/
|
|
|
|
__u32 lastcount; /* count at entry to dropping state */
|
|
|
|
__u32 ldelay; /* in-queue delay seen by most recently dequeued packet */
|
|
|
|
__s32 drop_next; /* time to drop next packet */
|
|
|
|
__u32 drop_overlimit; /* number of time max qdisc packet limit was hit */
|
|
|
|
__u32 ecn_mark; /* number of packets we ECN marked instead of dropped */
|
|
|
|
__u32 dropping; /* are we in dropping state ? */
|
codel: add ce_threshold attribute
For DCTCP or similar ECN based deployments on fabrics with shallow
buffers, hosts are responsible for a good part of the buffering.
This patch adds an optional ce_threshold to codel & fq_codel qdiscs,
so that DCTCP can have feedback from queuing in the host.
A DCTCP enabled egress port simply has a queue occupancy threshold
above which ECT packets get a CE mark.
In codel language this translates to a sojourn time, so that one doesn't
have to worry about bytes or bandwidth but delays.
This makes the host an active participant in the health of the whole
network.
This also helps experimenting DCTCP in a setup without DCTCP compliant
fabric.
On following example, ce_threshold is set to 1ms, and we can see from
'ldelay xxx us' that TCP is not trying to go around the 5ms codel
target.
Queue has more capacity to absorb inelastic bursts (say from UDP
traffic), as queues are maintained to an optimal level.
lpaa23:~# ./tc -s -d qd sh dev eth1
qdisc mq 1: dev eth1 root
Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961)
backlog 3108242b 364p requeues 42961
qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503)
rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503
count 0 lastcount 0 ldelay 1.0ms drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384
qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186)
rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186
count 0 lastcount 0 ldelay 694us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873
qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554)
rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554
count 0 lastcount 0 ldelay 889us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780
...
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-09 06:05:12 +08:00
|
|
|
__u32 ce_mark; /* number of CE marked packets because of ce_threshold */
|
codel: Controlled Delay AQM
An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson.
http://queue.acm.org/detail.cfm?id=2209336
This AQM's main input is no longer the queue size in bytes or packets, but
the delay packets spend in the (FIFO) queue.
As we don't have infinite memory, we can still drop packets in enqueue()
in case of massive load, but CoDel's primary mechanism is to drop packets
in dequeue(), using a control law based on two simple parameters:
target : target sojourn time (default 5ms)
interval : width of moving time window (default 100ms)
Based on initial work from Dave Taht.
Refactored to help future codel inclusion as a plugin for other linux
qdisc (FQ_CODEL, ...), like RED.
include/net/codel.h contains the codel algorithm, as close as possible to
Kathleen's reference.
net/sched/sch_codel.c contains the linux qdisc specific glue.
Separate structures permit a memory efficient implementation of fq_codel
(to be sent as a separate work) : Each flow has its own struct
codel_vars.
timestamps are taken at enqueue() time with 1024 ns precision, allowing
a range of 2199 seconds in queue, and 100Gb links support. iproute2 uses
usec as base unit.
Selected packets are dropped, unless ECN is enabled and packets can get
ECN mark instead.
Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and
tg3 drivers (BQL enabled).
Usage: tc qdisc ... codel [ limit PACKETS ] [ target TIME ]
[ interval TIME ] [ ecn ]
qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn
Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0)
rate 202365Kbit 16708pps backlog 113550b 75p requeues 0
count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us
maxpacket 1514 ecn_mark 84399 drop_overlimit 0
CoDel must be seen as a base module, and should be used keeping in mind
there is still a FIFO queue. So a typical setup will probably need a
hierarchy of several qdiscs and packet classifiers to be able to meet
whatever constraints a user might have.
One possible example would be to use fq_codel, which combines Fair
Queueing and CoDel, in replacement of sfq / sfq_red.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Dave Taht <dave.taht@bufferbloat.net>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Van Jacobson <van@pollere.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 15:51:25 +08:00
|
|
|
};
|
|
|
|
|
fq_codel: Fair Queue Codel AQM
Fair Queue Codel packet scheduler
Principles :
- Packets are classified (internal classifier or external) on flows.
- This is a Stochastic model (as we use a hash, several flows might
be hashed on same slot)
- Each flow has a CoDel managed queue.
- Flows are linked onto two (Round Robin) lists,
so that new flows have priority on old ones.
- For a given flow, packets are not reordered (CoDel uses a FIFO)
- head drops only.
- ECN capability is on by default.
- Very low memory footprint (64 bytes per flow)
tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ]
[ target TIME ] [ interval TIME ] [ noecn ]
[ quantum BYTES ]
defaults : 1024 flows, 10240 packets limit, quantum : device MTU
target : 5ms (CoDel default)
interval : 100ms (CoDel default)
Impressive results on load :
class htb 1:1 root leaf 10: prio 0 quantum 1514 rate 200000Kbit ceil 200000Kbit burst 1475b/8 mpu 0b overhead 0b cburst 1475b/8 mpu 0b overhead 0b level 0
Sent 43304920109 bytes 33063109 pkt (dropped 0, overlimits 0 requeues 0)
rate 201691Kbit 28595pps backlog 0b 312p requeues 0
lended: 33063109 borrowed: 0 giants: 0
tokens: -912 ctokens: -912
class fq_codel 10:1735 parent 10:
(dropped 1292, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4524 parent 10:
(dropped 1291, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4e74 parent 10:
(dropped 1290, overlimits 0 requeues 0)
backlog 6056b 4p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 6.4ms dropping drop_next 92.0ms
class fq_codel 10:628a parent 10:
(dropped 1289, overlimits 0 requeues 0)
backlog 7570b 5p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.4ms dropping drop_next 90.9ms
class fq_codel 10:a4b3 parent 10:
(dropped 302, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:c3c2 parent 10:
(dropped 1284, overlimits 0 requeues 0)
backlog 13626b 9p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:d331 parent 10:
(dropped 299, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.0ms
class fq_codel 10:d526 parent 10:
(dropped 12160, overlimits 0 requeues 0)
backlog 35870b 211p requeues 0
deficit 1508 count 12160 lastcount 1 ldelay 15.3ms dropping drop_next 247us
class fq_codel 10:e2c6 parent 10:
(dropped 1288, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:eab5 parent 10:
(dropped 1285, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:f220 parent 10:
(dropped 1289, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
qdisc htb 1: root refcnt 6 r2q 10 default 1 direct_packets_stat 0 ver 3.17
Sent 43331086547 bytes 33092812 pkt (dropped 0, overlimits 66063544 requeues 71)
rate 201697Kbit 28602pps backlog 0b 260p requeues 71
qdisc fq_codel 10: parent 1:1 limit 10240p flows 65536 target 5.0ms interval 100.0ms ecn
Sent 43331086547 bytes 33092812 pkt (dropped 949359, overlimits 0 requeues 0)
rate 201697Kbit 28602pps backlog 189352b 260p requeues 0
maxpacket 1514 drop_overlimit 0 new_flow_count 5582 ecn_mark 125593
new_flows_len 0 old_flows_len 11
PING 172.30.42.18 (172.30.42.18) 56(84) bytes of data.
64 bytes from 172.30.42.18: icmp_req=1 ttl=64 time=0.227 ms
64 bytes from 172.30.42.18: icmp_req=2 ttl=64 time=0.165 ms
64 bytes from 172.30.42.18: icmp_req=3 ttl=64 time=0.166 ms
64 bytes from 172.30.42.18: icmp_req=4 ttl=64 time=0.151 ms
64 bytes from 172.30.42.18: icmp_req=5 ttl=64 time=0.164 ms
64 bytes from 172.30.42.18: icmp_req=6 ttl=64 time=0.172 ms
64 bytes from 172.30.42.18: icmp_req=7 ttl=64 time=0.175 ms
64 bytes from 172.30.42.18: icmp_req=8 ttl=64 time=0.183 ms
64 bytes from 172.30.42.18: icmp_req=9 ttl=64 time=0.158 ms
64 bytes from 172.30.42.18: icmp_req=10 ttl=64 time=0.200 ms
10 packets transmitted, 10 received, 0% packet loss, time 8999ms
rtt min/avg/max/mdev = 0.151/0.176/0.227/0.022 ms
Much better than SFQ because of the priority given to new flows, and a fast
path dirtying fewer cache lines.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-11 17:30:50 +08:00
|
|
|
/* FQ_CODEL */
|
|
|
|
|
|
|
|
enum {
|
|
|
|
TCA_FQ_CODEL_UNSPEC,
|
|
|
|
TCA_FQ_CODEL_TARGET,
|
|
|
|
TCA_FQ_CODEL_LIMIT,
|
|
|
|
TCA_FQ_CODEL_INTERVAL,
|
|
|
|
TCA_FQ_CODEL_ECN,
|
|
|
|
TCA_FQ_CODEL_FLOWS,
|
|
|
|
TCA_FQ_CODEL_QUANTUM,
|
codel: add ce_threshold attribute
For DCTCP or similar ECN based deployments on fabrics with shallow
buffers, hosts are responsible for a good part of the buffering.
This patch adds an optional ce_threshold to codel & fq_codel qdiscs,
so that DCTCP can have feedback from queuing in the host.
A DCTCP enabled egress port simply has a queue occupancy threshold
above which ECT packets get a CE mark.
In codel language this translates to a sojourn time, so that one doesn't
have to worry about bytes or bandwidth but delays.
This makes the host an active participant in the health of the whole
network.
This also helps experimenting DCTCP in a setup without DCTCP compliant
fabric.
On following example, ce_threshold is set to 1ms, and we can see from
'ldelay xxx us' that TCP is not trying to go around the 5ms codel
target.
Queue has more capacity to absorb inelastic bursts (say from UDP
traffic), as queues are maintained to an optimal level.
lpaa23:~# ./tc -s -d qd sh dev eth1
qdisc mq 1: dev eth1 root
Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961)
backlog 3108242b 364p requeues 42961
qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503)
rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503
count 0 lastcount 0 ldelay 1.0ms drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384
qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186)
rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186
count 0 lastcount 0 ldelay 694us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873
qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554)
rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554
count 0 lastcount 0 ldelay 889us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780
...
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-09 06:05:12 +08:00
|
|
|
TCA_FQ_CODEL_CE_THRESHOLD,
|
2016-05-02 07:47:26 +08:00
|
|
|
TCA_FQ_CODEL_DROP_BATCH_SIZE,
|
2016-05-06 23:55:12 +08:00
|
|
|
TCA_FQ_CODEL_MEMORY_LIMIT,
|
fq_codel: Fair Queue Codel AQM
Fair Queue Codel packet scheduler
Principles :
- Packets are classified (internal classifier or external) on flows.
- This is a Stochastic model (as we use a hash, several flows might
be hashed on same slot)
- Each flow has a CoDel managed queue.
- Flows are linked onto two (Round Robin) lists,
so that new flows have priority on old ones.
- For a given flow, packets are not reordered (CoDel uses a FIFO)
- head drops only.
- ECN capability is on by default.
- Very low memory footprint (64 bytes per flow)
tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ]
[ target TIME ] [ interval TIME ] [ noecn ]
[ quantum BYTES ]
defaults : 1024 flows, 10240 packets limit, quantum : device MTU
target : 5ms (CoDel default)
interval : 100ms (CoDel default)
Impressive results on load :
class htb 1:1 root leaf 10: prio 0 quantum 1514 rate 200000Kbit ceil 200000Kbit burst 1475b/8 mpu 0b overhead 0b cburst 1475b/8 mpu 0b overhead 0b level 0
Sent 43304920109 bytes 33063109 pkt (dropped 0, overlimits 0 requeues 0)
rate 201691Kbit 28595pps backlog 0b 312p requeues 0
lended: 33063109 borrowed: 0 giants: 0
tokens: -912 ctokens: -912
class fq_codel 10:1735 parent 10:
(dropped 1292, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4524 parent 10:
(dropped 1291, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4e74 parent 10:
(dropped 1290, overlimits 0 requeues 0)
backlog 6056b 4p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 6.4ms dropping drop_next 92.0ms
class fq_codel 10:628a parent 10:
(dropped 1289, overlimits 0 requeues 0)
backlog 7570b 5p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.4ms dropping drop_next 90.9ms
class fq_codel 10:a4b3 parent 10:
(dropped 302, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:c3c2 parent 10:
(dropped 1284, overlimits 0 requeues 0)
backlog 13626b 9p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:d331 parent 10:
(dropped 299, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.0ms
class fq_codel 10:d526 parent 10:
(dropped 12160, overlimits 0 requeues 0)
backlog 35870b 211p requeues 0
deficit 1508 count 12160 lastcount 1 ldelay 15.3ms dropping drop_next 247us
class fq_codel 10:e2c6 parent 10:
(dropped 1288, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:eab5 parent 10:
(dropped 1285, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:f220 parent 10:
(dropped 1289, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
qdisc htb 1: root refcnt 6 r2q 10 default 1 direct_packets_stat 0 ver 3.17
Sent 43331086547 bytes 33092812 pkt (dropped 0, overlimits 66063544 requeues 71)
rate 201697Kbit 28602pps backlog 0b 260p requeues 71
qdisc fq_codel 10: parent 1:1 limit 10240p flows 65536 target 5.0ms interval 100.0ms ecn
Sent 43331086547 bytes 33092812 pkt (dropped 949359, overlimits 0 requeues 0)
rate 201697Kbit 28602pps backlog 189352b 260p requeues 0
maxpacket 1514 drop_overlimit 0 new_flow_count 5582 ecn_mark 125593
new_flows_len 0 old_flows_len 11
PING 172.30.42.18 (172.30.42.18) 56(84) bytes of data.
64 bytes from 172.30.42.18: icmp_req=1 ttl=64 time=0.227 ms
64 bytes from 172.30.42.18: icmp_req=2 ttl=64 time=0.165 ms
64 bytes from 172.30.42.18: icmp_req=3 ttl=64 time=0.166 ms
64 bytes from 172.30.42.18: icmp_req=4 ttl=64 time=0.151 ms
64 bytes from 172.30.42.18: icmp_req=5 ttl=64 time=0.164 ms
64 bytes from 172.30.42.18: icmp_req=6 ttl=64 time=0.172 ms
64 bytes from 172.30.42.18: icmp_req=7 ttl=64 time=0.175 ms
64 bytes from 172.30.42.18: icmp_req=8 ttl=64 time=0.183 ms
64 bytes from 172.30.42.18: icmp_req=9 ttl=64 time=0.158 ms
64 bytes from 172.30.42.18: icmp_req=10 ttl=64 time=0.200 ms
10 packets transmitted, 10 received, 0% packet loss, time 8999ms
rtt min/avg/max/mdev = 0.151/0.176/0.227/0.022 ms
Much better than SFQ because of the priority given to new flows, and a fast
path dirtying fewer cache lines.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-11 17:30:50 +08:00
|
|
|
__TCA_FQ_CODEL_MAX
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Highest attribute number of the TCA_FQ_CODEL_* enum above:
 * __TCA_FQ_CODEL_MAX is that enum's final enumerator, so this evaluates to
 * the value of the enumerator just before it (TCA_FQ_CODEL_MEMORY_LIMIT).
 */
#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1)
|
|
|
|
|
|
|
|
/* Kinds of fq_codel extended statistics.
 * NOTE(review): presumably the value carried in tc_fq_codel_xstats.type,
 * selecting which member of its union is valid - confirm against the
 * qdisc implementation.
 */
enum {
|
|
|
|
TCA_FQ_CODEL_XSTATS_QDISC, /* 0: first enumerator, no explicit value */
|
|
|
|
TCA_FQ_CODEL_XSTATS_CLASS, /* 1 */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct tc_fq_codel_qd_stats {
|
|
|
|
__u32 maxpacket; /* largest packet we've seen so far */
|
|
|
|
__u32 drop_overlimit; /* number of time max qdisc
|
|
|
|
* packet limit was hit
|
|
|
|
*/
|
|
|
|
__u32 ecn_mark; /* number of packets we ECN marked
|
|
|
|
* instead of being dropped
|
|
|
|
*/
|
|
|
|
__u32 new_flow_count; /* number of time packets
|
|
|
|
* created a 'new flow'
|
|
|
|
*/
|
|
|
|
__u32 new_flows_len; /* count of flows in new list */
|
|
|
|
__u32 old_flows_len; /* count of flows in old list */
|
codel: add ce_threshold attribute
For DCTCP or similar ECN based deployments on fabrics with shallow
buffers, hosts are responsible for a good part of the buffering.
This patch adds an optional ce_threshold to codel & fq_codel qdiscs,
so that DCTCP can have feedback from queuing in the host.
A DCTCP enabled egress port simply has a queue occupancy threshold
above which ECT packets get a CE mark.
In codel language this translates to a sojourn time, so that one doesn't
have to worry about bytes or bandwidth but delays.
This makes the host an active participant in the health of the whole
network.
This also helps experimenting DCTCP in a setup without DCTCP compliant
fabric.
On following example, ce_threshold is set to 1ms, and we can see from
'ldelay xxx us' that TCP is not trying to go around the 5ms codel
target.
Queue has more capacity to absorb inelastic bursts (say from UDP
traffic), as queues are maintained to an optimal level.
lpaa23:~# ./tc -s -d qd sh dev eth1
qdisc mq 1: dev eth1 root
Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961)
backlog 3108242b 364p requeues 42961
qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503)
rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503
count 0 lastcount 0 ldelay 1.0ms drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384
qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186)
rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186
count 0 lastcount 0 ldelay 694us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873
qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554)
rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554
count 0 lastcount 0 ldelay 889us drop_next 0us
maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780
...
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-09 06:05:12 +08:00
|
|
|
__u32 ce_mark; /* packets above ce_threshold */
|
2016-05-06 23:55:12 +08:00
|
|
|
__u32 memory_usage; /* in bytes */
|
|
|
|
__u32 drop_overmemory;
|
fq_codel: Fair Queue Codel AQM
Fair Queue Codel packet scheduler
Principles :
- Packets are classified (internal classifier or external) on flows.
- This is a Stochastic model (as we use a hash, several flows might
be hashed on same slot)
- Each flow has a CoDel managed queue.
- Flows are linked onto two (Round Robin) lists,
so that new flows have priority on old ones.
- For a given flow, packets are not reordered (CoDel uses a FIFO)
- head drops only.
- ECN capability is on by default.
- Very low memory footprint (64 bytes per flow)
tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ]
[ target TIME ] [ interval TIME ] [ noecn ]
[ quantum BYTES ]
defaults : 1024 flows, 10240 packets limit, quantum : device MTU
target : 5ms (CoDel default)
interval : 100ms (CoDel default)
Impressive results on load :
class htb 1:1 root leaf 10: prio 0 quantum 1514 rate 200000Kbit ceil 200000Kbit burst 1475b/8 mpu 0b overhead 0b cburst 1475b/8 mpu 0b overhead 0b level 0
Sent 43304920109 bytes 33063109 pkt (dropped 0, overlimits 0 requeues 0)
rate 201691Kbit 28595pps backlog 0b 312p requeues 0
lended: 33063109 borrowed: 0 giants: 0
tokens: -912 ctokens: -912
class fq_codel 10:1735 parent 10:
(dropped 1292, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4524 parent 10:
(dropped 1291, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4e74 parent 10:
(dropped 1290, overlimits 0 requeues 0)
backlog 6056b 4p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 6.4ms dropping drop_next 92.0ms
class fq_codel 10:628a parent 10:
(dropped 1289, overlimits 0 requeues 0)
backlog 7570b 5p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.4ms dropping drop_next 90.9ms
class fq_codel 10:a4b3 parent 10:
(dropped 302, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:c3c2 parent 10:
(dropped 1284, overlimits 0 requeues 0)
backlog 13626b 9p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:d331 parent 10:
(dropped 299, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.0ms
class fq_codel 10:d526 parent 10:
(dropped 12160, overlimits 0 requeues 0)
backlog 35870b 211p requeues 0
deficit 1508 count 12160 lastcount 1 ldelay 15.3ms dropping drop_next 247us
class fq_codel 10:e2c6 parent 10:
(dropped 1288, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:eab5 parent 10:
(dropped 1285, overlimits 0 requeues 0)
backlog 16654b 11p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:f220 parent 10:
(dropped 1289, overlimits 0 requeues 0)
backlog 15140b 10p requeues 0
deficit 1514 count 1 lastcount 1 ldelay 7.1ms
qdisc htb 1: root refcnt 6 r2q 10 default 1 direct_packets_stat 0 ver 3.17
Sent 43331086547 bytes 33092812 pkt (dropped 0, overlimits 66063544 requeues 71)
rate 201697Kbit 28602pps backlog 0b 260p requeues 71
qdisc fq_codel 10: parent 1:1 limit 10240p flows 65536 target 5.0ms interval 100.0ms ecn
Sent 43331086547 bytes 33092812 pkt (dropped 949359, overlimits 0 requeues 0)
rate 201697Kbit 28602pps backlog 189352b 260p requeues 0
maxpacket 1514 drop_overlimit 0 new_flow_count 5582 ecn_mark 125593
new_flows_len 0 old_flows_len 11
PING 172.30.42.18 (172.30.42.18) 56(84) bytes of data.
64 bytes from 172.30.42.18: icmp_req=1 ttl=64 time=0.227 ms
64 bytes from 172.30.42.18: icmp_req=2 ttl=64 time=0.165 ms
64 bytes from 172.30.42.18: icmp_req=3 ttl=64 time=0.166 ms
64 bytes from 172.30.42.18: icmp_req=4 ttl=64 time=0.151 ms
64 bytes from 172.30.42.18: icmp_req=5 ttl=64 time=0.164 ms
64 bytes from 172.30.42.18: icmp_req=6 ttl=64 time=0.172 ms
64 bytes from 172.30.42.18: icmp_req=7 ttl=64 time=0.175 ms
64 bytes from 172.30.42.18: icmp_req=8 ttl=64 time=0.183 ms
64 bytes from 172.30.42.18: icmp_req=9 ttl=64 time=0.158 ms
64 bytes from 172.30.42.18: icmp_req=10 ttl=64 time=0.200 ms
10 packets transmitted, 10 received, 0% packet loss, time 8999ms
rtt min/avg/max/mdev = 0.151/0.176/0.227/0.022 ms
Much better than SFQ because of the priority given to new flows, and a fast
path dirtying fewer cache lines.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-11 17:30:50 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Per-class fq_codel statistics; used as the class_stats member of the
 * union in struct tc_fq_codel_xstats.
 * NOTE(review): count, lastcount, dropping and drop_next share their names
 * with fields of struct tc_codel_xstats and presumably carry the same
 * meaning for a single flow - confirm.
 */
struct tc_fq_codel_cl_stats {
|
|
|
|
__s32 deficit; /* NOTE(review): presumably remaining byte credit of the flow in the current round (cf. quantum) - confirm */
|
|
|
|
__u32 ldelay; /* in-queue delay seen by most recently
|
|
|
|
* dequeued packet
|
|
|
|
*/
|
|
|
|
__u32 count; /* presumably: drops done since dropping state was last entered */
|
|
|
|
__u32 lastcount; /* presumably: count at entry to dropping state */
|
|
|
|
__u32 dropping; /* presumably: non-zero while in dropping state */
|
|
|
|
__s32 drop_next; /* presumably: time to drop next packet (signed) */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Extended statistics record for fq_codel: a type tag followed by an
 * anonymous union that holds either qdisc-wide or per-class statistics.
 */
struct tc_fq_codel_xstats {
|
|
|
|
__u32 type; /* NOTE(review): presumably TCA_FQ_CODEL_XSTATS_QDISC or TCA_FQ_CODEL_XSTATS_CLASS, telling which union member is valid - confirm */
|
|
|
|
union {
|
|
|
|
struct tc_fq_codel_qd_stats qdisc_stats; /* qdisc-wide counters */
|
|
|
|
struct tc_fq_codel_cl_stats class_stats; /* per-class counters */
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
to add per socket limitation.
Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.
TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.
This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.
Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit
cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)
15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.
In slow start or steady state, very few packets are throttled [1]
FQ gets a bunch of tunables as :
limit : max number of packets on whole Qdisc (default 10000)
flow_limit : max number of packets per flow (default 100)
quantum : the credit per RR round (default is 2 MTU)
initial_quantum : initial credit for new flows (default is 10 MTU)
maxrate : max per flow rate (default : unlimited)
buckets : number of RB trees (default : 1024) in hash table.
(consumes 8 bytes per bucket)
[no]pacing : disable/enable pacing (default is enable)
All of them can be changed on a live qdisc.
$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
[ quantum BYTES ] [ initial_quantum BYTES ]
[ maxrate RATE ] [ buckets NUMBER ]
[ [no]pacing ]
$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
backlog 0b 0p requeues 14
511 flows, 511 inactive, 0 throttled
110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit
[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-30 06:49:55 +08:00
|
|
|
/* FQ */
|
|
|
|
|
|
|
|
enum {
|
|
|
|
TCA_FQ_UNSPEC,
|
|
|
|
|
|
|
|
TCA_FQ_PLIMIT, /* limit of total number of packets in queue */
|
|
|
|
|
|
|
|
TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */
|
|
|
|
|
|
|
|
TCA_FQ_QUANTUM, /* RR quantum */
|
|
|
|
|
|
|
|
TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */
|
|
|
|
|
|
|
|
TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */
|
|
|
|
|
2013-11-16 00:57:26 +08:00
|
|
|
TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */
|
pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
to add per socket limitation.
Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.
TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.
This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.
Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit
cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)
15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.
In slow start or steady state, very few packets are throttled [1]
FQ gets a bunch of tunables as :
limit : max number of packets on whole Qdisc (default 10000)
flow_limit : max number of packets per flow (default 100)
quantum : the credit per RR round (default is 2 MTU)
initial_quantum : initial credit for new flows (default is 10 MTU)
maxrate : max per flow rate (default : unlimited)
buckets : number of RB trees (default : 1024) in hash table.
(consumes 8 bytes per bucket)
[no]pacing : disable/enable pacing (default is enable)
All of them can be changed on a live qdisc.
$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
[ quantum BYTES ] [ initial_quantum BYTES ]
[ maxrate RATE ] [ buckets NUMBER ]
[ [no]pacing ]
$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
backlog 0b 0p requeues 14
511 flows, 511 inactive, 0 throttled
110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit
[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-30 06:49:55 +08:00
|
|
|
|
|
|
|
TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */
|
|
|
|
|
|
|
|
TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */
|
2013-11-16 00:58:14 +08:00
|
|
|
|
|
|
|
TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */
|
|
|
|
|
pkt_sched: fq: better control of DDOS traffic
FQ has a fast path for skb attached to a socket, as it does not
have to compute a flow hash. But for other packets, FQ being non
stochastic means that hosts exposed to random Internet traffic
can allocate millions of flow structures (104 bytes each) pretty
easily. Not only can the host OOM, but lookups in RB trees can take
too much CPU and memory resources.
This patch adds a new attribute, orphan_mask, that adds the
possibility of having a stochastic hash for orphaned skbs.
Its default value is 1024 slots, to mimic SFQ behavior.
Note: This does not apply to locally generated TCP traffic,
and no locally generated traffic will share a flow structure
with another perfect or stochastic flow.
This patch also handles the specific case of SYNACK messages:
They are attached to the listener socket, and therefore all map
to a single hash bucket. If the listener has set SO_MAX_PACING_RATE,
hoping to have new accepted socket inherit this rate, SYNACK
might be paced and even dropped.
This is very similar to an internal patch Google has used for more
than one year.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 13:30:40 +08:00
|
|
|
TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */
|
|
|
|
|
2016-09-20 11:39:11 +08:00
|
|
|
TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
|
|
|
|
|
2018-11-12 01:11:31 +08:00
|
|
|
TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */
|
|
|
|
|
net_sched: sch_fq: enable use of hrtimer slack
Add a new attribute to control the fq qdisc hrtimer slack.
Default is set to 10 usec.
When/if packets are throttled, fq sets up an hrtimer that can
lead to one interrupt per packet in the throttled queue.
By using a timer slack, we allow better use of timer interrupts,
by giving them a chance to call multiple timer callbacks
at each hardware interrupt.
Also, giving a slack allows FQ to dequeue batches of packets
instead of a single one, thus increasing xmit_more efficiency.
This has no negative effect on the rate a TCP flow can sustain,
since each TCP flow maintains its own precise vtime (tp->tcp_wstamp_ns)
v2: added strict netlink checking (as feedback from Jakub Kicinski)
Tested:
1000 concurrent flows all using paced packets.
1,000,000 packets sent per second.
Before the patch :
$ vmstat 2 10
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 60726784 23628 3485992 0 0 138 1 977 535 0 12 87 0 0
0 0 0 60714700 23628 3485628 0 0 0 0 1568827 26462 0 22 78 0 0
1 0 0 60716012 23628 3485656 0 0 0 0 1570034 26216 0 22 78 0 0
0 0 0 60722420 23628 3485492 0 0 0 0 1567230 26424 0 22 78 0 0
0 0 0 60727484 23628 3485556 0 0 0 0 1568220 26200 0 22 78 0 0
2 0 0 60718900 23628 3485380 0 0 0 40 1564721 26630 0 22 78 0 0
2 0 0 60718096 23628 3485332 0 0 0 0 1562593 26432 0 22 78 0 0
0 0 0 60719608 23628 3485064 0 0 0 0 1563806 26238 0 22 78 0 0
1 0 0 60722876 23628 3485236 0 0 0 130 1565874 26566 0 22 78 0 0
1 0 0 60722752 23628 3484908 0 0 0 0 1567646 26247 0 22 78 0 0
After the patch, slack of 10 usec, we can see a reduction of interrupts
per second, and a small decrease of reported cpu usage.
$ vmstat 2 10
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
1 0 0 60722564 23628 3484728 0 0 133 1 696 545 0 13 87 0 0
1 0 0 60722568 23628 3484824 0 0 0 0 977278 25469 0 20 80 0 0
0 0 0 60716396 23628 3484764 0 0 0 0 979997 25326 0 20 80 0 0
0 0 0 60713844 23628 3484960 0 0 0 0 981394 25249 0 20 80 0 0
2 0 0 60720468 23628 3484916 0 0 0 0 982860 25062 0 20 80 0 0
1 0 0 60721236 23628 3484856 0 0 0 0 982867 25100 0 20 80 0 0
1 0 0 60722400 23628 3484456 0 0 0 8 982698 25303 0 20 80 0 0
0 0 0 60715396 23628 3484428 0 0 0 0 981777 25176 0 20 80 0 0
0 0 0 60716520 23628 3486544 0 0 0 36 978965 27857 0 21 79 0 0
0 0 0 60719592 23628 3486516 0 0 0 22 977318 25106 0 20 80 0 0
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-17 10:12:51 +08:00
|
|
|
TCA_FQ_TIMER_SLACK, /* timer slack */
|
|
|
|
|
2020-05-01 22:07:41 +08:00
|
|
|
TCA_FQ_HORIZON, /* time horizon in us */
|
|
|
|
|
|
|
|
TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */
|
|
|
|
|
pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
to add per socket limitation.
Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.
TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.
This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.
Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit
cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)
15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.
In slow start or steady state, very few packets are throttled [1]
FQ gets a bunch of tunables as :
limit : max number of packets on whole Qdisc (default 10000)
flow_limit : max number of packets per flow (default 100)
quantum : the credit per RR round (default is 2 MTU)
initial_quantum : initial credit for new flows (default is 10 MTU)
maxrate : max per flow rate (default : unlimited)
buckets : number of RB trees (default : 1024) in hash table.
(consumes 8 bytes per bucket)
[no]pacing : disable/enable pacing (default is enable)
All of them can be changed on a live qdisc.
$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
[ quantum BYTES ] [ initial_quantum BYTES ]
[ maxrate RATE ] [ buckets NUMBER ]
[ [no]pacing ]
$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
backlog 0b 0p requeues 14
511 flows, 511 inactive, 0 throttled
110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit
[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-30 06:49:55 +08:00
|
|
|
__TCA_FQ_MAX
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Highest valid TCA_FQ_* attribute number: one less than the __TCA_FQ_MAX enum terminator. */
#define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
|
|
|
|
|
|
|
|
struct tc_fq_qd_stats {
|
|
|
|
__u64 gc_flows;
|
|
|
|
__u64 highprio_packets;
|
|
|
|
__u64 tcp_retrans;
|
|
|
|
__u64 throttled;
|
|
|
|
__u64 flows_plimit;
|
|
|
|
__u64 pkts_too_long;
|
|
|
|
__u64 allocation_errors;
|
|
|
|
__s64 time_next_delayed_flow;
|
|
|
|
__u32 flows;
|
|
|
|
__u32 inactive_flows;
|
|
|
|
__u32 throttled_flows;
|
2016-09-22 23:58:55 +08:00
|
|
|
__u32 unthrottle_latency_ns;
|
2018-11-12 01:11:31 +08:00
|
|
|
__u64 ce_mark; /* packets above ce_threshold */
|
2020-05-01 22:07:41 +08:00
|
|
|
__u64 horizon_drops;
|
|
|
|
__u64 horizon_caps;
|
pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
to add per socket limitation.
Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.
TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.
This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.
Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit
cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)
15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.
In slow start or steady state, very few packets are throttled [1]
FQ gets a bunch of tunables as :
limit : max number of packets on whole Qdisc (default 10000)
flow_limit : max number of packets per flow (default 100)
quantum : the credit per RR round (default is 2 MTU)
initial_quantum : initial credit for new flows (default is 10 MTU)
maxrate : max per flow rate (default : unlimited)
buckets : number of RB trees (default : 1024) in hash table.
(consumes 8 bytes per bucket)
[no]pacing : disable/enable pacing (default is enable)
All of them can be changed on a live qdisc.
$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
[ quantum BYTES ] [ initial_quantum BYTES ]
[ maxrate RATE ] [ buckets NUMBER ]
[ [no]pacing ]
$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
backlog 0b 0p requeues 14
511 flows, 511 inactive, 0 throttled
110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit
[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-30 06:49:55 +08:00
|
|
|
};
|
net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc
This patch implements the first size-based qdisc that attempts to
differentiate between small flows and heavy-hitters. The goal is to
catch the heavy-hitters and move them to a separate queue with less
priority so that bulk traffic does not affect the latency of critical
traffic. Currently "less priority" means less weight (2:1 in
particular) in a Weighted Deficit Round Robin (WDRR) scheduler.
In essence, this patch addresses the "delay-bloat" problem due to
bloated buffers. In some systems, large queues may be necessary for
obtaining CPU efficiency, or due to the presence of unresponsive
traffic like UDP, or just a large number of connections with each
having a small amount of outstanding traffic. In these circumstances,
HHF aims to reduce the HoL blocking for latency sensitive traffic,
while not impacting the queues built up by bulk traffic. HHF can also
be used in conjunction with other AQM mechanisms such as CoDel.
To capture heavy-hitters, we implement the "multi-stage filter" design
in the following paper:
C. Estan and G. Varghese, "New Directions in Traffic Measurement and
Accounting", in ACM SIGCOMM, 2002.
Some configurable qdisc settings through 'tc':
- hhf_reset_timeout: period to reset counter values in the multi-stage
filter (default 40ms)
- hhf_admit_bytes: threshold to classify heavy-hitters
(default 128KB)
- hhf_evict_timeout: threshold to evict idle heavy-hitters
(default 1s)
- hhf_non_hh_weight: Weighted Deficit Round Robin (WDRR) weight for
non-heavy-hitters (default 2)
- hh_flows_limit: max number of heavy-hitter flow entries
(default 2048)
Note that the ratio between hhf_admit_bytes and hhf_reset_timeout
reflects the bandwidth of heavy-hitters that we attempt to capture
(25Mbps with the above default settings).
The false negative rate (heavy-hitter flows getting away unclassified)
is zero by the design of the multi-stage filter algorithm.
With 100 heavy-hitter flows, using four hashes and 4000 counters yields
a false positive rate (non-heavy-hitters mistakenly classified as
heavy-hitters) of less than 1e-4.
Signed-off-by: Terry Lam <vtlam@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-12-15 16:30:21 +08:00
|
|
|
|
|
|
|
/* Heavy-Hitter Filter */
|
|
|
|
|
|
|
|
/* Attribute identifiers for configuring the Heavy-Hitter Filter (HHF) qdisc. */
enum {
|
|
|
|
TCA_HHF_UNSPEC, /* first enumerator (value 0); unspecified attribute */
|
|
|
|
TCA_HHF_BACKLOG_LIMIT, /* NOTE(review): presumably the max qdisc packet limit - confirm */
|
|
|
|
TCA_HHF_QUANTUM, /* NOTE(review): presumably the WDRR quantum - confirm */
|
|
|
|
TCA_HHF_HH_FLOWS_LIMIT, /* max number of heavy-hitter flow entries */
|
|
|
|
TCA_HHF_RESET_TIMEOUT, /* period to reset counter values in the multi-stage filter */
|
|
|
|
TCA_HHF_ADMIT_BYTES, /* threshold to classify heavy-hitters */
|
|
|
|
TCA_HHF_EVICT_TIMEOUT, /* threshold to evict idle heavy-hitters */
|
|
|
|
TCA_HHF_NON_HH_WEIGHT, /* WDRR weight for non-heavy-hitters */
|
|
|
|
__TCA_HHF_MAX /* terminator; TCA_HHF_MAX is derived from it */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Highest valid TCA_HHF_* attribute number: one less than the __TCA_HHF_MAX enum terminator. */
#define TCA_HHF_MAX (__TCA_HHF_MAX - 1)
|
|
|
|
|
|
|
|
/* Statistics counters reported for the Heavy-Hitter Filter (HHF) qdisc. */
struct tc_hhf_xstats {
|
|
|
|
__u32 drop_overlimit; /* number of times max qdisc packet limit
|
|
|
|
* was hit
|
|
|
|
*/
|
|
|
|
__u32 hh_overlimit; /* number of times max heavy-hitters was hit */
|
|
|
|
__u32 hh_tot_count; /* number of captured heavy-hitters so far */
|
|
|
|
__u32 hh_cur_count; /* number of current heavy-hitters */
|
|
|
|
};
|
net: pkt_sched: PIE AQM scheme
Proportional Integral controller Enhanced (PIE) is a scheduler to address the
bufferbloat problem.
From the IETF draft below:
" Bufferbloat is a phenomenon where excess buffers in the network cause high
latency and jitter. As more and more interactive applications (e.g. voice over
IP, real time video streaming and financial transactions) run in the Internet,
high latency and jitter degrade application performance. There is a pressing
need to design intelligent queue management schemes that can control latency and
jitter; and hence provide desirable quality of service to users.
We present here a lightweight design, PIE(Proportional Integral controller
Enhanced) that can effectively control the average queueing latency to a target
value. Simulation results, theoretical analysis and Linux testbed results have
shown that PIE can ensure low latency and achieve high link utilization under
various congestion situations. The design does not require per-packet
timestamp, so it incurs very small overhead and is simple enough to implement
in both hardware and software. "
Many thanks to Dave Taht for extensive feedback, reviews, testing and
suggestions. Thanks also to Stephen Hemminger and Eric Dumazet for reviews and
suggestions. Naeem Khademi and Dave Taht independently contributed to ECN
support.
For more information, please see technical paper about PIE in the IEEE
Conference on High Performance Switching and Routing 2013. A copy of the paper
can be found at ftp://ftpeng.cisco.com/pie/.
Please also refer to the IETF draft submission at
http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
All relevant code, documents and test scripts and results can be found at
ftp://ftpeng.cisco.com/pie/.
For problems with the iproute2/tc or Linux kernel code, please contact Vijay
Subramanian (vijaynsu@cisco.com or subramanian.vijay@gmail.com) Mythili Prabhu
(mysuryan@cisco.com)
Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Mythili Prabhu <mysuryan@cisco.com>
CC: Dave Taht <dave.taht@bufferbloat.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-05 09:33:55 +08:00
|
|
|
|
|
|
|
/* PIE */
|
|
|
|
enum {
|
|
|
|
TCA_PIE_UNSPEC,
|
|
|
|
TCA_PIE_TARGET,
|
|
|
|
TCA_PIE_LIMIT,
|
|
|
|
TCA_PIE_TUPDATE,
|
|
|
|
TCA_PIE_ALPHA,
|
|
|
|
TCA_PIE_BETA,
|
|
|
|
TCA_PIE_ECN,
|
|
|
|
TCA_PIE_BYTEMODE,
|
2019-11-20 22:13:54 +08:00
|
|
|
TCA_PIE_DQ_RATE_ESTIMATOR,
|
net: pkt_sched: PIE AQM scheme
Proportional Integral controller Enhanced (PIE) is a scheduler to address the
bufferbloat problem.
From the IETF draft below:
" Bufferbloat is a phenomenon where excess buffers in the network cause high
latency and jitter. As more and more interactive applications (e.g. voice over
IP, real time video streaming and financial transactions) run in the Internet,
high latency and jitter degrade application performance. There is a pressing
need to design intelligent queue management schemes that can control latency and
jitter; and hence provide desirable quality of service to users.
We present here a lightweight design, PIE(Proportional Integral controller
Enhanced) that can effectively control the average queueing latency to a target
value. Simulation results, theoretical analysis and Linux testbed results have
shown that PIE can ensure low latency and achieve high link utilization under
various congestion situations. The design does not require per-packet
timestamp, so it incurs very small overhead and is simple enough to implement
in both hardware and software. "
Many thanks to Dave Taht for extensive feedback, reviews, testing and
suggestions. Thanks also to Stephen Hemminger and Eric Dumazet for reviews and
suggestions. Naeem Khademi and Dave Taht independently contributed to ECN
support.
For more information, please see technical paper about PIE in the IEEE
Conference on High Performance Switching and Routing 2013. A copy of the paper
can be found at ftp://ftpeng.cisco.com/pie/.
Please also refer to the IETF draft submission at
http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
All relevant code, documents and test scripts and results can be found at
ftp://ftpeng.cisco.com/pie/.
For problems with the iproute2/tc or Linux kernel code, please contact Vijay
Subramanian (vijaynsu@cisco.com or subramanian.vijay@gmail.com) Mythili Prabhu
(mysuryan@cisco.com)
Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Mythili Prabhu <mysuryan@cisco.com>
CC: Dave Taht <dave.taht@bufferbloat.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-05 09:33:55 +08:00
|
|
|
__TCA_PIE_MAX
|
|
|
|
};
|
|
|
|
/* One less than __TCA_PIE_MAX, the closing enumerator of the enum above. */
#define TCA_PIE_MAX (__TCA_PIE_MAX - 1)
|
|
|
|
|
|
|
|
struct tc_pie_xstats {
|
2019-11-20 22:13:54 +08:00
|
|
|
__u64 prob; /* current probability */
|
|
|
|
__u32 delay; /* current delay in ms */
|
|
|
|
__u32 avg_dq_rate; /* current average dq_rate in
|
|
|
|
* bits/pie_time
|
|
|
|
*/
|
|
|
|
__u32 dq_rate_estimating; /* is avg_dq_rate being calculated? */
|
|
|
|
__u32 packets_in; /* total number of packets enqueued */
|
|
|
|
__u32 dropped; /* packets dropped due to pie_action */
|
|
|
|
__u32 overlimit; /* dropped due to lack of space
|
|
|
|
* in queue
|
|
|
|
*/
|
|
|
|
__u32 maxq; /* maximum queue size */
|
|
|
|
__u32 ecn_mark; /* packets marked with ecn*/
|
net: pkt_sched: PIE AQM scheme
Proportional Integral controller Enhanced (PIE) is a scheduler to address the
bufferbloat problem.
From the IETF draft below:
" Bufferbloat is a phenomenon where excess buffers in the network cause high
latency and jitter. As more and more interactive applications (e.g. voice over
IP, real time video streaming and financial transactions) run in the Internet,
high latency and jitter degrade application performance. There is a pressing
need to design intelligent queue management schemes that can control latency and
jitter; and hence provide desirable quality of service to users.
We present here a lightweight design, PIE(Proportional Integral controller
Enhanced) that can effectively control the average queueing latency to a target
value. Simulation results, theoretical analysis and Linux testbed results have
shown that PIE can ensure low latency and achieve high link utilization under
various congestion situations. The design does not require per-packet
timestamp, so it incurs very small overhead and is simple enough to implement
in both hardware and software. "
Many thanks to Dave Taht for extensive feedback, reviews, testing and
suggestions. Thanks also to Stephen Hemminger and Eric Dumazet for reviews and
suggestions. Naeem Khademi and Dave Taht independently contributed to ECN
support.
For more information, please see technical paper about PIE in the IEEE
Conference on High Performance Switching and Routing 2013. A copy of the paper
can be found at ftp://ftpeng.cisco.com/pie/.
Please also refer to the IETF draft submission at
http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
All relevant code, documents and test scripts and results can be found at
ftp://ftpeng.cisco.com/pie/.
For problems with the iproute2/tc or Linux kernel code, please contact Vijay
Subramanian (vijaynsu@cisco.com or subramanian.vijay@gmail.com) Mythili Prabhu
(mysuryan@cisco.com)
Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Mythili Prabhu <mysuryan@cisco.com>
CC: Dave Taht <dave.taht@bufferbloat.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-05 09:33:55 +08:00
|
|
|
};
|
2017-10-17 09:01:26 +08:00
|
|
|
|
2020-01-23 02:22:33 +08:00
|
|
|
/* FQ PIE */
|
|
|
|
/* TCA_FQ_PIE_* identifiers, numbered consecutively from
 * TCA_FQ_PIE_UNSPEC (0). __TCA_FQ_PIE_MAX is not a real entry: it only
 * marks the end of the list so that TCA_FQ_PIE_MAX can be derived from it.
 * NOTE(review): presumably the netlink attribute types of the fq_pie
 * qdisc - confirm. If so, the numeric values are ABI and new entries
 * must be added immediately before __TCA_FQ_PIE_MAX.
 */
enum {
|
|
|
|
TCA_FQ_PIE_UNSPEC,
|
|
|
|
TCA_FQ_PIE_LIMIT,
|
|
|
|
TCA_FQ_PIE_FLOWS,
|
|
|
|
TCA_FQ_PIE_TARGET,
|
|
|
|
TCA_FQ_PIE_TUPDATE,
|
|
|
|
TCA_FQ_PIE_ALPHA,
|
|
|
|
TCA_FQ_PIE_BETA,
|
|
|
|
TCA_FQ_PIE_QUANTUM,
|
|
|
|
TCA_FQ_PIE_MEMORY_LIMIT,
|
|
|
|
TCA_FQ_PIE_ECN_PROB,
|
|
|
|
TCA_FQ_PIE_ECN,
|
|
|
|
TCA_FQ_PIE_BYTEMODE,
|
|
|
|
TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
|
|
|
|
__TCA_FQ_PIE_MAX
|
|
|
|
};
|
|
|
|
/* One less than __TCA_FQ_PIE_MAX, i.e. the value of the last real
 * TCA_FQ_PIE_* enumerator (TCA_FQ_PIE_DQ_RATE_ESTIMATOR).
 */
#define TCA_FQ_PIE_MAX (__TCA_FQ_PIE_MAX - 1)
|
|
|
|
|
|
|
|
/* FQ-PIE statistics record: nine consecutive __u32 fields (36 bytes, so
 * no padding is needed between them).
 * NOTE(review): presumably copied to userspace as the qdisc's extended
 * stats; if so the field order and sizes are ABI - confirm before
 * changing anything here.
 */
struct tc_fq_pie_xstats {
|
|
|
|
__u32 packets_in; /* total number of packets enqueued */
|
|
|
|
__u32 dropped; /* packets dropped due to fq_pie_action */
|
|
|
|
__u32 overlimit; /* dropped due to lack of space in queue */
|
|
|
|
__u32 overmemory; /* dropped due to lack of memory in queue */
|
|
|
|
__u32 ecn_mark; /* packets marked with ecn */
|
|
|
|
__u32 new_flow_count; /* count of new flows created by packets */
|
|
|
|
__u32 new_flows_len; /* count of flows in new list */
|
|
|
|
__u32 old_flows_len; /* count of flows in old list */
|
|
|
|
__u32 memory_usage; /* total memory across all queues */
|
|
|
|
};
|
|
|
|
|
2017-10-17 09:01:26 +08:00
|
|
|
/* CBS */
|
|
|
|
/* CBS option block: one byte, three bytes of explicit padding, then four
 * signed 32-bit values (1 + 3 + 4 * 4 = 20 bytes).
 * NOTE(review): the per-field meanings below are inferred from the field
 * names only (credit-based shaper terminology); units are not visible
 * here - confirm against the sch_cbs implementation / tc-cbs(8).
 */
struct tc_cbs_qopt {
|
|
|
|
__u8 offload; /* NOTE(review): presumably non-zero requests hardware offload - confirm */
|
|
|
|
__u8 _pad[3]; /* explicit padding so that hicredit starts at offset 4 */
|
|
|
|
__s32 hicredit; /* NOTE(review): presumably the upper credit bound - confirm */
|
|
|
|
__s32 locredit; /* NOTE(review): presumably the lower credit bound - confirm */
|
|
|
|
__s32 idleslope; /* NOTE(review): presumably the credit gain rate while waiting - confirm */
|
|
|
|
__s32 sendslope; /* NOTE(review): presumably the credit drain rate while sending - confirm */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* TCA_CBS_* identifiers: TCA_CBS_UNSPEC is 0, TCA_CBS_PARMS is 1, and
 * __TCA_CBS_MAX is an end marker used only to derive TCA_CBS_MAX.
 * NOTE(review): TCA_CBS_PARMS presumably carries a struct tc_cbs_qopt -
 * confirm against the qdisc's netlink policy.
 */
enum {
|
|
|
|
TCA_CBS_UNSPEC,
|
|
|
|
TCA_CBS_PARMS,
|
|
|
|
__TCA_CBS_MAX,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* One less than __TCA_CBS_MAX, i.e. the value of TCA_CBS_PARMS. */
#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
|
|
|
|
|
net/sched: Introduce the ETF Qdisc
The ETF (Earliest TxTime First) qdisc uses the information added
earlier in this series (the socket option SO_TXTIME and the new
role of sk_buff->tstamp) to schedule packets transmission based
on absolute time.
For some workloads, just bandwidth enforcement is not enough, and
precise control of the transmission of packets is necessary.
Example:
$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0
$ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \
clockid CLOCK_TAI
In this example, the Qdisc will provide SW best-effort for the control
of the transmission time to the network adapter, the time stamp in the
socket will be in reference to the clockid CLOCK_TAI and packets
will leave the qdisc "delta" (100000) nanoseconds before its transmission
time.
The ETF qdisc will buffer packets sorted by their txtime. It will drop
packets on enqueue() if their skbuff clockid does not match the clock
reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped
if it expires while being enqueued.
The qdisc also supports the SO_TXTIME deadline mode. For this mode, it
will dequeue a packet as soon as possible and change the skb timestamp
to 'now' during etf_dequeue().
Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid
to be configured, but it's been decided that usage of CLOCK_TAI should
be enforced until we decide to allow for other clockids to be used.
The rationale here is that PTP times are usually in the TAI scale, thus
no other clocks should be necessary. For now, the qdisc will return
EINVAL if any clocks other than CLOCK_TAI are used.
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-04 06:42:53 +08:00
|
|
|
|
|
|
|
/* ETF */
|
|
|
|
struct tc_etf_qopt {
|
|
|
|
__s32 delta;
|
|
|
|
__s32 clockid;
|
|
|
|
__u32 flags;
|
2019-06-26 06:07:13 +08:00
|
|
|
#define TC_ETF_DEADLINE_MODE_ON _BITUL(0)
|
|
|
|
#define TC_ETF_OFFLOAD_ON _BITUL(1)
|
2019-06-26 06:07:14 +08:00
|
|
|
#define TC_ETF_SKIP_SOCK_CHECK _BITUL(2)
|
net/sched: Introduce the ETF Qdisc
The ETF (Earliest TxTime First) qdisc uses the information added
earlier in this series (the socket option SO_TXTIME and the new
role of sk_buff->tstamp) to schedule packets transmission based
on absolute time.
For some workloads, just bandwidth enforcement is not enough, and
precise control of the transmission of packets is necessary.
Example:
$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0
$ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \
clockid CLOCK_TAI
In this example, the Qdisc will provide SW best-effort for the control
of the transmission time to the network adapter, the time stamp in the
socket will be in reference to the clockid CLOCK_TAI and packets
will leave the qdisc "delta" (100000) nanoseconds before its transmission
time.
The ETF qdisc will buffer packets sorted by their txtime. It will drop
packets on enqueue() if their skbuff clockid does not match the clock
reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped
if it expires while being enqueued.
The qdisc also supports the SO_TXTIME deadline mode. For this mode, it
will dequeue a packet as soon as possible and change the skb timestamp
to 'now' during etf_dequeue().
Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid
to be configured, but it's been decided that usage of CLOCK_TAI should
be enforced until we decide to allow for other clockids to be used.
The rationale here is that PTP times are usually in the TAI scale, thus
no other clocks should be necessary. For now, the qdisc will return
EINVAL if any clocks other than CLOCK_TAI are used.
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-04 06:42:53 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/* TCA_ETF_* identifiers: TCA_ETF_UNSPEC is 0, TCA_ETF_PARMS is 1, and
 * __TCA_ETF_MAX is an end marker used only to derive TCA_ETF_MAX.
 * NOTE(review): TCA_ETF_PARMS presumably carries a struct tc_etf_qopt -
 * confirm against the qdisc's netlink policy.
 */
enum {
|
|
|
|
TCA_ETF_UNSPEC,
|
|
|
|
TCA_ETF_PARMS,
|
|
|
|
__TCA_ETF_MAX,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* One less than __TCA_ETF_MAX, i.e. the value of TCA_ETF_PARMS. */
#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
|
|
|
|
|
sched: Add Common Applications Kept Enhanced (cake) qdisc
sch_cake targets the home router use case and is intended to squeeze the
most bandwidth and latency out of even the slowest ISP links and routers,
while presenting an API simple enough that even an ISP can configure it.
Example of use on a cable ISP uplink:
tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter
To shape a cable download link (ifb and tc-mirred setup elided)
tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash
CAKE is filled with:
* A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
derived Flow Queuing system, which autoconfigures based on the bandwidth.
* A novel "triple-isolate" mode (the default) which balances per-host
and per-flow FQ even through NAT.
* A deficit-based shaper that can also be used in an unlimited mode.
* 8 way set associative hashing to reduce flow collisions to a minimum.
* A reasonable interpretation of various diffserv latency/loss tradeoffs.
* Support for zeroing diffserv markings for entering and exiting traffic.
* Support for interacting well with Docsis 3.0 shaper framing.
* Extensive support for DSL framing types.
* Support for ack filtering.
* Extensive statistics for measuring, loss, ecn markings, latency
variation.
A paper describing the design of CAKE is available at
https://arxiv.org/abs/1804.07617, and will be published at the 2018 IEEE
International Symposium on Local and Metropolitan Area Networks (LANMAN).
This patch adds the base shaper and packet scheduler, while subsequent
commits add the optional (configurable) features. The full userspace API
and most data structures are included in this commit, but options not
understood in the base version will be ignored.
Various versions baking have been available as an out of tree build for
kernel versions going back to 3.10, as the embedded router world has been
running a few years behind mainline Linux. A stable version has been
generally available on lede-17.01 and later.
sch_cake replaces a combination of iptables, tc filter, htb and fq_codel
in the sqm-scripts, with sane defaults and vastly simpler configuration.
CAKE's principal author is Jonathan Morton, with contributions from
Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, Sebastian Moeller,
Ryan Mounce, Tony Ambardar, Dean Scarff, Nils Andreas Svee, Dave Täht,
and Loganaden Velvindron.
Testing from Pete Heist, Georgios Amanakis, and the many other members of
the cake@lists.bufferbloat.net mailing list.
tc -s qdisc show dev eth2
qdisc cake 8017: root refcnt 2 bandwidth 1Gbit diffserv3 triple-isolate split-gso rtt 100.0ms noatm overhead 38 mpu 84
Sent 51504294511 bytes 37724591 pkt (dropped 6, overlimits 64958695 requeues 12)
backlog 0b 0p requeues 12
memory used: 1053008b of 15140Kb
capacity estimate: 970Mbit
min/max network layer size: 28 / 1500
min/max overhead-adjusted size: 84 / 1538
average network hdr offset: 14
Bulk Best Effort Voice
thresh 62500Kbit 1Gbit 250Mbit
target 5.0ms 5.0ms 5.0ms
interval 100.0ms 100.0ms 100.0ms
pk_delay 5us 5us 6us
av_delay 3us 2us 2us
sp_delay 2us 1us 1us
backlog 0b 0b 0b
pkts 3164050 25030267 9530280
bytes 3227519915 35396974782 12879808898
way_inds 0 8 0
way_miss 21 366 25
way_cols 0 0 0
drops 5 0 1
marks 0 0 0
ack_drop 0 0 0
sp_flows 1 3 0
bk_flows 0 1 1
un_flows 0 0 0
max_len 68130 68130 68130
Tested-by: Pete Heist <peteheist@gmail.com>
Tested-by: Georgios Amanakis <gamanakis@gmail.com>
Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-06 23:37:19 +08:00
|
|
|
|
|
|
|
/* CAKE */
|
|
|
|
enum {
|
|
|
|
TCA_CAKE_UNSPEC,
|
|
|
|
TCA_CAKE_PAD,
|
|
|
|
TCA_CAKE_BASE_RATE64,
|
|
|
|
TCA_CAKE_DIFFSERV_MODE,
|
|
|
|
TCA_CAKE_ATM,
|
|
|
|
TCA_CAKE_FLOW_MODE,
|
|
|
|
TCA_CAKE_OVERHEAD,
|
|
|
|
TCA_CAKE_RTT,
|
|
|
|
TCA_CAKE_TARGET,
|
|
|
|
TCA_CAKE_AUTORATE,
|
|
|
|
TCA_CAKE_MEMORY,
|
|
|
|
TCA_CAKE_NAT,
|
|
|
|
TCA_CAKE_RAW,
|
|
|
|
TCA_CAKE_WASH,
|
|
|
|
TCA_CAKE_MPU,
|
|
|
|
TCA_CAKE_INGRESS,
|
|
|
|
TCA_CAKE_ACK_FILTER,
|
|
|
|
TCA_CAKE_SPLIT_GSO,
|
2019-03-01 23:04:05 +08:00
|
|
|
TCA_CAKE_FWMARK,
|
sched: Add Common Applications Kept Enhanced (cake) qdisc
sch_cake targets the home router use case and is intended to squeeze the
most bandwidth and latency out of even the slowest ISP links and routers,
while presenting an API simple enough that even an ISP can configure it.
Example of use on a cable ISP uplink:
tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter
To shape a cable download link (ifb and tc-mirred setup elided)
tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash
CAKE is filled with:
* A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
derived Flow Queuing system, which autoconfigures based on the bandwidth.
* A novel "triple-isolate" mode (the default) which balances per-host
and per-flow FQ even through NAT.
* A deficit-based shaper that can also be used in an unlimited mode.
* 8 way set associative hashing to reduce flow collisions to a minimum.
* A reasonable interpretation of various diffserv latency/loss tradeoffs.
* Support for zeroing diffserv markings for entering and exiting traffic.
* Support for interacting well with Docsis 3.0 shaper framing.
* Extensive support for DSL framing types.
* Support for ack filtering.
* Extensive statistics for measuring, loss, ecn markings, latency
variation.
A paper describing the design of CAKE is available at
https://arxiv.org/abs/1804.07617, and will be published at the 2018 IEEE
International Symposium on Local and Metropolitan Area Networks (LANMAN).
This patch adds the base shaper and packet scheduler, while subsequent
commits add the optional (configurable) features. The full userspace API
and most data structures are included in this commit, but options not
understood in the base version will be ignored.
Various versions baking have been available as an out of tree build for
kernel versions going back to 3.10, as the embedded router world has been
running a few years behind mainline Linux. A stable version has been
generally available on lede-17.01 and later.
sch_cake replaces a combination of iptables, tc filter, htb and fq_codel
in the sqm-scripts, with sane defaults and vastly simpler configuration.
CAKE's principal author is Jonathan Morton, with contributions from
Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, Sebastian Moeller,
Ryan Mounce, Tony Ambardar, Dean Scarff, Nils Andreas Svee, Dave Täht,
and Loganaden Velvindron.
Testing from Pete Heist, Georgios Amanakis, and the many other members of
the cake@lists.bufferbloat.net mailing list.
tc -s qdisc show dev eth2
qdisc cake 8017: root refcnt 2 bandwidth 1Gbit diffserv3 triple-isolate split-gso rtt 100.0ms noatm overhead 38 mpu 84
Sent 51504294511 bytes 37724591 pkt (dropped 6, overlimits 64958695 requeues 12)
backlog 0b 0p requeues 12
memory used: 1053008b of 15140Kb
capacity estimate: 970Mbit
min/max network layer size: 28 / 1500
min/max overhead-adjusted size: 84 / 1538
average network hdr offset: 14
Bulk Best Effort Voice
thresh 62500Kbit 1Gbit 250Mbit
target 5.0ms 5.0ms 5.0ms
interval 100.0ms 100.0ms 100.0ms
pk_delay 5us 5us 6us
av_delay 3us 2us 2us
sp_delay 2us 1us 1us
backlog 0b 0b 0b
pkts 3164050 25030267 9530280
bytes 3227519915 35396974782 12879808898
way_inds 0 8 0
way_miss 21 366 25
way_cols 0 0 0
drops 5 0 1
marks 0 0 0
ack_drop 0 0 0
sp_flows 1 3 0
bk_flows 0 1 1
un_flows 0 0 0
max_len 68130 68130 68130
Tested-by: Pete Heist <peteheist@gmail.com>
Tested-by: Georgios Amanakis <gamanakis@gmail.com>
Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-06 23:37:19 +08:00
|
|
|
__TCA_CAKE_MAX
|
|
|
|
};
|
|
|
|
/* One less than __TCA_CAKE_MAX, the closing enumerator of the TCA_CAKE_*
 * enum above.
 */
#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1)
|
|
|
|
|
|
|
|
/* TCA_CAKE_STATS_* identifiers, numbered consecutively.
 * Value 0 is reserved as __TCA_CAKE_STATS_INVALID, so the first usable
 * identifier (TCA_CAKE_STATS_PAD) is 1. __TCA_CAKE_STATS_MAX is an end
 * marker used only to derive TCA_CAKE_STATS_MAX.
 * NOTE(review): names ending in 64 presumably denote 64-bit payloads and
 * names ending in _US presumably denote microseconds - confirm against
 * the sch_cake dump code.
 */
enum {
|
|
|
|
__TCA_CAKE_STATS_INVALID,
|
|
|
|
TCA_CAKE_STATS_PAD,
|
|
|
|
TCA_CAKE_STATS_CAPACITY_ESTIMATE64,
|
|
|
|
TCA_CAKE_STATS_MEMORY_LIMIT,
|
|
|
|
TCA_CAKE_STATS_MEMORY_USED,
|
|
|
|
TCA_CAKE_STATS_AVG_NETOFF,
|
|
|
|
TCA_CAKE_STATS_MIN_NETLEN,
|
|
|
|
TCA_CAKE_STATS_MAX_NETLEN,
|
|
|
|
TCA_CAKE_STATS_MIN_ADJLEN,
|
|
|
|
TCA_CAKE_STATS_MAX_ADJLEN,
|
|
|
|
TCA_CAKE_STATS_TIN_STATS,
|
|
|
|
TCA_CAKE_STATS_DEFICIT,
|
|
|
|
TCA_CAKE_STATS_COBALT_COUNT,
|
|
|
|
TCA_CAKE_STATS_DROPPING,
|
|
|
|
TCA_CAKE_STATS_DROP_NEXT_US,
|
|
|
|
TCA_CAKE_STATS_P_DROP,
|
|
|
|
TCA_CAKE_STATS_BLUE_TIMER_US,
|
|
|
|
__TCA_CAKE_STATS_MAX
|
|
|
|
};
|
|
|
|
/* One less than __TCA_CAKE_STATS_MAX, i.e. the value of
 * TCA_CAKE_STATS_BLUE_TIMER_US.
 */
#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1)
|
|
|
|
|
|
|
|
/* TCA_CAKE_TIN_STATS_* identifiers, numbered consecutively.
 * Value 0 is reserved as __TCA_CAKE_TIN_STATS_INVALID, so the first
 * usable identifier (TCA_CAKE_TIN_STATS_PAD) is 1.
 * __TCA_CAKE_TIN_STATS_MAX is an end marker used only to derive
 * TCA_CAKE_TIN_STATS_MAX.
 * NOTE(review): presumably the per-tin counters nested under
 * TCA_CAKE_STATS_TIN_STATS; names ending in 64 presumably denote 64-bit
 * payloads and names ending in _US microseconds - confirm against the
 * sch_cake dump code.
 */
enum {
|
|
|
|
__TCA_CAKE_TIN_STATS_INVALID,
|
|
|
|
TCA_CAKE_TIN_STATS_PAD,
|
|
|
|
TCA_CAKE_TIN_STATS_SENT_PACKETS,
|
|
|
|
TCA_CAKE_TIN_STATS_SENT_BYTES64,
|
|
|
|
TCA_CAKE_TIN_STATS_DROPPED_PACKETS,
|
|
|
|
TCA_CAKE_TIN_STATS_DROPPED_BYTES64,
|
|
|
|
TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS,
|
|
|
|
TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64,
|
|
|
|
TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS,
|
|
|
|
TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64,
|
|
|
|
TCA_CAKE_TIN_STATS_BACKLOG_PACKETS,
|
|
|
|
TCA_CAKE_TIN_STATS_BACKLOG_BYTES,
|
|
|
|
TCA_CAKE_TIN_STATS_THRESHOLD_RATE64,
|
|
|
|
TCA_CAKE_TIN_STATS_TARGET_US,
|
|
|
|
TCA_CAKE_TIN_STATS_INTERVAL_US,
|
|
|
|
TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS,
|
|
|
|
TCA_CAKE_TIN_STATS_WAY_MISSES,
|
|
|
|
TCA_CAKE_TIN_STATS_WAY_COLLISIONS,
|
|
|
|
TCA_CAKE_TIN_STATS_PEAK_DELAY_US,
|
|
|
|
TCA_CAKE_TIN_STATS_AVG_DELAY_US,
|
|
|
|
TCA_CAKE_TIN_STATS_BASE_DELAY_US,
|
|
|
|
TCA_CAKE_TIN_STATS_SPARSE_FLOWS,
|
|
|
|
TCA_CAKE_TIN_STATS_BULK_FLOWS,
|
|
|
|
TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS,
|
|
|
|
TCA_CAKE_TIN_STATS_MAX_SKBLEN,
|
|
|
|
TCA_CAKE_TIN_STATS_FLOW_QUANTUM,
|
|
|
|
__TCA_CAKE_TIN_STATS_MAX
|
|
|
|
};
|
|
|
|
/* One less than __TCA_CAKE_TIN_STATS_MAX, i.e. the value of
 * TCA_CAKE_TIN_STATS_FLOW_QUANTUM.
 */
#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1)
|
|
|
|
/* NOTE(review): presumably the upper bound on the number of CAKE tins
 * (priority tiers) reported or configured at once - confirm against
 * sch_cake before relying on it.
 */
#define TC_CAKE_MAX_TINS (8)
|
|
|
|
|
|
|
|
/* CAKE flow-isolation modes.
 * The enumerators are numbered 0..7 consecutively, which makes the three
 * basic modes act as bit flags: CAKE_FLOW_SRC_IP = 1, CAKE_FLOW_DST_IP = 2
 * and CAKE_FLOW_FLOWS = 4. The remaining modes equal the bitwise OR of the
 * basic ones, as the trailing comments spell out (e.g. CAKE_FLOW_HOSTS is
 * 3 = 1 | 2, CAKE_FLOW_TRIPLE is 7 = 3 | 4).
 * CAKE_FLOW_MAX (8) is the number of modes, not a mode itself.
 * Reordering or inserting entries would break the flag arithmetic.
 */
enum {
|
|
|
|
CAKE_FLOW_NONE = 0,
|
|
|
|
CAKE_FLOW_SRC_IP,
|
|
|
|
CAKE_FLOW_DST_IP,
|
|
|
|
CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */
|
|
|
|
CAKE_FLOW_FLOWS,
|
|
|
|
CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */
|
|
|
|
CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */
|
|
|
|
CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */
|
|
|
|
CAKE_FLOW_MAX,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* CAKE diffserv modes, numbered consecutively from
 * CAKE_DIFFSERV_DIFFSERV3 (0). CAKE_DIFFSERV_MAX (5) is the number of
 * modes, not a mode itself.
 * NOTE(review): presumably the values accepted for
 * TCA_CAKE_DIFFSERV_MODE - confirm against sch_cake.
 */
enum {
|
|
|
|
CAKE_DIFFSERV_DIFFSERV3 = 0,
|
|
|
|
CAKE_DIFFSERV_DIFFSERV4,
|
|
|
|
CAKE_DIFFSERV_DIFFSERV8,
|
|
|
|
CAKE_DIFFSERV_BESTEFFORT,
|
|
|
|
CAKE_DIFFSERV_PRECEDENCE,
|
|
|
|
CAKE_DIFFSERV_MAX
|
|
|
|
};
|
|
|
|
|
|
|
|
/* CAKE ACK-filter settings, numbered consecutively from CAKE_ACK_NONE (0).
 * CAKE_ACK_MAX (3) is the number of settings, not a setting itself.
 * NOTE(review): presumably the values accepted for TCA_CAKE_ACK_FILTER -
 * confirm against sch_cake.
 */
enum {
|
|
|
|
CAKE_ACK_NONE = 0,
|
|
|
|
CAKE_ACK_FILTER,
|
|
|
|
CAKE_ACK_AGGRESSIVE,
|
|
|
|
CAKE_ACK_MAX
|
|
|
|
};
|
|
|
|
|
|
|
|
/* CAKE link-layer framing compensation modes, numbered consecutively from
 * CAKE_ATM_NONE (0). CAKE_ATM_MAX (3) is the number of modes, not a mode
 * itself.
 * NOTE(review): presumably the values accepted for TCA_CAKE_ATM -
 * confirm against sch_cake.
 */
enum {
|
|
|
|
CAKE_ATM_NONE = 0,
|
|
|
|
CAKE_ATM_ATM,
|
|
|
|
CAKE_ATM_PTM,
|
|
|
|
CAKE_ATM_MAX
|
|
|
|
};
|
|
|
|
|
tc: Add support for configuring the taprio scheduler
This traffic scheduler allows traffic classes states (transmission
allowed/not allowed, in the simplest case) to be scheduled, according
to a pre-generated time sequence. This is the basis of the IEEE
802.1Qbv specification.
Example configuration:
tc qdisc replace dev enp3s0 parent root handle 100 taprio \
num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
queues 1@0 1@1 2@2 \
base-time 1528743495910289987 \
sched-entry S 01 300000 \
sched-entry S 02 300000 \
sched-entry S 04 300000 \
clockid CLOCK_TAI
The configuration format is similar to mqprio. The main difference is
the presence of a schedule, built by multiple "sched-entry"
definitions, each entry has the following format:
sched-entry <CMD> <GATE MASK> <INTERVAL>
The only supported <CMD> is "S", which means "SetGateStates",
following the IEEE 802.1Qbv-2015 definition (Table 8-6). <GATE MASK>
is a bitmask where each bit is associated with a traffic class, so
bit 0 (the least significant bit) being "on" means that traffic class
0 is "active" for that schedule entry. <INTERVAL> is a time duration
in nanoseconds that specifies for how long that state defined by <CMD>
and <GATE MASK> should be held before moving to the next entry.
This schedule is circular, that is, after the last entry is executed
it starts from the first one, indefinitely.
The other parameters can be defined as follows:
- base-time: specifies the instant when the schedule starts, if
'base-time' is a time in the past, the schedule will start at
base-time + (N * cycle-time)
where N is the smallest integer so the resulting time is greater
than "now", and "cycle-time" is the sum of all the intervals of the
entries in the schedule;
- clockid: specifies the reference clock to be used;
The parameters should be similar to what the IEEE 802.1Q family of
specification defines.
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-29 08:59:43 +08:00
|
|
|
|
|
|
|
/* TAPRIO */
|
|
|
|
/* Command codes for a taprio schedule entry, with explicit values so
 * they stay fixed regardless of declaration order.
 * NOTE(review): TC_TAPRIO_CMD_SET_GATES presumably encodes the "S"
 * (SetGateStates, IEEE 802.1Qbv-2015 Table 8-6) command of the tc
 * "sched-entry" syntax; SET_AND_HOLD / SET_AND_RELEASE presumably map to
 * the corresponding 802.1Q hold/release operations - confirm against
 * sch_taprio.
 */
enum {
|
|
|
|
TC_TAPRIO_CMD_SET_GATES = 0x00,
|
|
|
|
TC_TAPRIO_CMD_SET_AND_HOLD = 0x01,
|
|
|
|
TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Identifiers for the fields of a single taprio schedule entry, numbered
 * consecutively from TCA_TAPRIO_SCHED_ENTRY_UNSPEC (0). The trailing
 * comment on each field gives its payload type.
 * __TCA_TAPRIO_SCHED_ENTRY_MAX is an end marker used only to derive
 * TCA_TAPRIO_SCHED_ENTRY_MAX.
 */
enum {
|
|
|
|
TCA_TAPRIO_SCHED_ENTRY_UNSPEC,
|
|
|
|
TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */
|
|
|
|
TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */
|
|
|
|
TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */
|
|
|
|
TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */
|
|
|
|
__TCA_TAPRIO_SCHED_ENTRY_MAX,
|
|
|
|
};
|
|
|
|
/* One less than __TCA_TAPRIO_SCHED_ENTRY_MAX, i.e. the value of
 * TCA_TAPRIO_SCHED_ENTRY_INTERVAL.
 */
#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1)
|
|
|
|
|
|
|
|
/* The format for schedule entry list is:
|
|
|
|
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY_CMD]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY_INTERVAL]
|
|
|
|
*/
|
|
|
|
/* Identifiers used inside a taprio schedule entry list:
 * TCA_TAPRIO_SCHED_UNSPEC is 0, TCA_TAPRIO_SCHED_ENTRY is 1, and
 * __TCA_TAPRIO_SCHED_MAX is an end marker used only to derive
 * TCA_TAPRIO_SCHED_MAX.
 * NOTE(review): each TCA_TAPRIO_SCHED_ENTRY presumably nests the
 * TCA_TAPRIO_SCHED_ENTRY_* fields of one entry - confirm against
 * sch_taprio's parsing code.
 */
enum {
|
|
|
|
TCA_TAPRIO_SCHED_UNSPEC,
|
|
|
|
TCA_TAPRIO_SCHED_ENTRY,
|
|
|
|
__TCA_TAPRIO_SCHED_MAX,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* One less than __TCA_TAPRIO_SCHED_MAX, i.e. the value of
 * TCA_TAPRIO_SCHED_ENTRY.
 */
#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
|
|
|
|
|
taprio: Add support adding an admin schedule
The IEEE 802.1Q-2018 defines two "types" of schedules, the "Oper" (from
operational?) and "Admin" ones. Up until now, 'taprio' only had
support for the "Oper" one, added when the qdisc is created. This adds
support for the "Admin" one, which allows the .change() operation to
be supported.
Just for clarification, some quick (and dirty) definitions, the "Oper"
schedule is the currently (as in this instant) running one, and it's
read-only. The "Admin" one is the one that the system configurator has
installed, it can be changed, and it will be "promoted" to "Oper" when
its 'base-time' is reached.
The idea behind this patch is that calling something like the below,
(after taprio is already configured with an initial schedule):
$ tc qdisc change taprio dev IFACE parent root \
base-time X \
sched-entry <CMD> <GATES> <INTERVAL> \
...
Will cause a new admin schedule to be created and programmed to be
"promoted" to "Oper" at instant X. If an "Admin" schedule already
exists, it will be overwritten with the new parameters.
Up until now, there was some code that was added to ease the support
of changing a single entry of a schedule, but was ultimately unused.
Now, that we have support for "change" with more well thought
semantics, updating a single entry seems to be less useful.
So we remove what is in practice dead code, and return a "not
supported" error if the user tries to use it. If changing a single
entry would make the user's life easier we may resurrect this idea,
but at this point, removing it simplifies the code.
For now, only the schedule specific bits are allowed to be added for a
new schedule, that means that 'clockid', 'num_tc', 'map' and 'queues'
cannot be modified.
Example:
$ tc qdisc change dev IFACE parent root handle 100 taprio \
base-time $BASE_TIME \
sched-entry S 00 500000 \
sched-entry S 0f 500000 \
clockid CLOCK_TAI
The only change in the netlink API introduced by this change is the
introduction of an "admin" type in the response to a dump request,
that type allows userspace to separate the "oper" schedule from the
"admin" schedule. If userspace doesn't support the "admin" type, it
will only display the "oper" schedule.
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-30 06:48:31 +08:00
|
|
|
/* The format for the admin sched (dump only):
|
|
|
|
* [TCA_TAPRIO_SCHED_ADMIN_SCHED]
|
|
|
|
* [TCA_TAPRIO_ATTR_SCHED_BASE_TIME]
|
|
|
|
* [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY_CMD]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]
|
|
|
|
* [TCA_TAPRIO_SCHED_ENTRY_INTERVAL]
|
|
|
|
*/
|
|
|
|
|
2020-03-24 12:19:20 +08:00
|
|
|
#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST _BITUL(0)
|
|
|
|
/* Bit 1 of the taprio "flags" value (0x2).
 * NOTE(review): presumably requests full hardware offload of the
 * schedule - confirm against sch_taprio.
 */
#define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD _BITUL(1)
|
taprio: Add support for txtime-assist mode
Currently, we are seeing non-critical packets being transmitted outside of
their timeslice. We can confirm that the packets are being dequeued at the
right time. So, the delay is induced in the hardware side. The most likely
reason is the hardware queues are starving the lower priority queues.
In order to improve the performance of taprio, we will be making use of the
txtime feature provided by the ETF qdisc. For all the packets which do not
have the SO_TXTIME option set, taprio will set the transmit timestamp (set
in skb->tstamp) in this mode. TAPrio Qdisc will ensure that the transmit
time for the packet is set to when the gate is open. If SO_TXTIME is set,
the TAPrio qdisc will validate whether the timestamp (in skb->tstamp)
occurs when the gate corresponding to skb's traffic class is open.
Following two parameters added to support this mode:
- flags: used to enable txtime-assist mode. Will also be used to enable
other modes (like hardware offloading) later.
- txtime-delay: This indicates the minimum time it will take for the packet
to hit the wire. This is useful in determining whether we can transmit
the packet in the remaining time if the gate corresponding to the packet is
currently open.
An example configuration for enabling txtime-assist:
tc qdisc replace dev eth0 parent root handle 100 taprio \\
num_tc 3 \\
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\
queues 1@0 1@0 1@0 \\
base-time 1558653424279842568 \\
sched-entry S 01 300000 \\
sched-entry S 02 300000 \\
sched-entry S 04 400000 \\
flags 0x1 \\
txtime-delay 40000 \\
clockid CLOCK_TAI
tc qdisc replace dev $IFACE parent 100:1 etf skip_sock_check \\
offload delta 200000 clockid CLOCK_TAI
Note that all the traffic classes are mapped to the same queue. This is
only possible in taprio when txtime-assist is enabled. Also, note that the
ETF Qdisc is enabled with offload mode set.
In this mode, if the packet's traffic class is open and the complete packet
can be transmitted, taprio will try to transmit the packet immediately.
This will be done by setting skb->tstamp to current_time + the time delta
indicated in the txtime-delay parameter. This parameter indicates the time
taken (in software) for packet to reach the network adapter.
If the packet cannot be transmitted in the current interval or if the
packet's traffic is not currently transmitting, the skb->tstamp is set to
the next available timestamp value. This is tracked in the next_launchtime
parameter in the struct sched_entry.
The behaviour w.r.t admin and oper schedules is not changed from what is
present in software mode.
The transmit time is already known in advance. So, we do not need the HR
timers to advance the schedule and wakeup the dequeue side of taprio. So,
HR timer won't be run when this mode is enabled.
Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-26 06:07:17 +08:00
|
|
|
|
tc: Add support for configuring the taprio scheduler
This traffic scheduler allows traffic classes states (transmission
allowed/not allowed, in the simplest case) to be scheduled, according
to a pre-generated time sequence. This is the basis of the IEEE
802.1Qbv specification.
Example configuration:
tc qdisc replace dev enp3s0 parent root handle 100 taprio \
num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
queues 1@0 1@1 2@2 \
base-time 1528743495910289987 \
sched-entry S 01 300000 \
sched-entry S 02 300000 \
sched-entry S 04 300000 \
clockid CLOCK_TAI
The configuration format is similar to mqprio. The main difference is
the presence of a schedule, built by multiple "sched-entry"
definitions, each entry has the following format:
sched-entry <CMD> <GATE MASK> <INTERVAL>
The only supported <CMD> is "S", which means "SetGateStates",
following the IEEE 802.1Qbv-2015 definition (Table 8-6). <GATE MASK>
is a bitmask where each bit is associated with a traffic class, so
bit 0 (the least significant bit) being "on" means that traffic class
0 is "active" for that schedule entry. <INTERVAL> is a time duration
in nanoseconds that specifies for how long that state defined by <CMD>
and <GATE MASK> should be held before moving to the next entry.
This schedule is circular, that is, after the last entry is executed
it starts from the first one, indefinitely.
The other parameters can be defined as follows:
- base-time: specifies the instant when the schedule starts, if
'base-time' is a time in the past, the schedule will start at
base-time + (N * cycle-time)
where N is the smallest integer so the resulting time is greater
than "now", and "cycle-time" is the sum of all the intervals of the
entries in the schedule;
- clockid: specifies the reference clock to be used;
The parameters should be similar to what the IEEE 802.1Q family of
specification defines.
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-29 08:59:43 +08:00
|
|
|
enum {
|
|
|
|
TCA_TAPRIO_ATTR_UNSPEC,
|
|
|
|
TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
|
|
|
|
TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */
|
|
|
|
TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */
|
|
|
|
TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
|
|
|
|
TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
|
|
|
|
TCA_TAPRIO_PAD,
|
taprio: Add support adding an admin schedule
The IEEE 802.1Q-2018 defines two "types" of schedules, the "Oper" (from
operational?) and "Admin" ones. Up until now, 'taprio' only had
support for the "Oper" one, added when the qdisc is created. This adds
support for the "Admin" one, which allows the .change() operation to
be supported.
Just for clarification, some quick (and dirty) definitions, the "Oper"
schedule is the currently (as in this instant) running one, and it's
read-only. The "Admin" one is the one that the system configurator has
installed, it can be changed, and it will be "promoted" to "Oper" when
its 'base-time' is reached.
The idea behind this patch is that calling something like the below,
(after taprio is already configured with an initial schedule):
$ tc qdisc change taprio dev IFACE parent root \
base-time X \
sched-entry <CMD> <GATES> <INTERVAL> \
...
Will cause a new admin schedule to be created and programmed to be
"promoted" to "Oper" at instant X. If an "Admin" schedule already
exists, it will be overwritten with the new parameters.
Up until now, there was some code that was added to ease the support
of changing a single entry of a schedule, but was ultimately unused.
Now, that we have support for "change" with more well thought
semantics, updating a single entry seems to be less useful.
So we remove what is in practice dead code, and return a "not
supported" error if the user tries to use it. If changing a single
entry would make the user's life easier we may resurrect this idea,
but at this point, removing it simplifies the code.
For now, only the schedule specific bits are allowed to be added for a
new schedule, that means that 'clockid', 'num_tc', 'map' and 'queues'
cannot be modified.
Example:
$ tc qdisc change dev IFACE parent root handle 100 taprio \
base-time $BASE_TIME \
sched-entry S 00 500000 \
sched-entry S 0f 500000 \
clockid CLOCK_TAI
The only change in the netlink API introduced by this change is the
introduction of an "admin" type in the response to a dump request,
that type allows userspace to separate the "oper" schedule from the
"admin" schedule. If userspace doesn't support the "admin" type, it
will only display the "oper" schedule.
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-30 06:48:31 +08:00
|
|
|
TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
|
2019-04-30 06:48:32 +08:00
|
|
|
TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
|
2019-04-30 06:48:33 +08:00
|
|
|
TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
|
taprio: Add support for txtime-assist mode
Currently, we are seeing non-critical packets being transmitted outside of
their timeslice. We can confirm that the packets are being dequeued at the
right time. So, the delay is induced in the hardware side. The most likely
reason is the hardware queues are starving the lower priority queues.
In order to improve the performance of taprio, we will be making use of the
txtime feature provided by the ETF qdisc. For all the packets which do not
have the SO_TXTIME option set, taprio will set the transmit timestamp (set
in skb->tstamp) in this mode. TAPrio Qdisc will ensure that the transmit
time for the packet is set to when the gate is open. If SO_TXTIME is set,
the TAPrio qdisc will validate whether the timestamp (in skb->tstamp)
occurs when the gate corresponding to skb's traffic class is open.
The following two parameters are added to support this mode:
- flags: used to enable txtime-assist mode. Will also be used to enable
other modes (like hardware offloading) later.
- txtime-delay: This indicates the minimum time it will take for the packet
to hit the wire. This is useful in determining whether we can transmit
the packet in the remaining time if the gate corresponding to the packet is
currently open.
An example configuration for enabling txtime-assist:
tc qdisc replace dev eth0 parent root handle 100 taprio \\
num_tc 3 \\
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\
queues 1@0 1@0 1@0 \\
base-time 1558653424279842568 \\
sched-entry S 01 300000 \\
sched-entry S 02 300000 \\
sched-entry S 04 400000 \\
flags 0x1 \\
txtime-delay 40000 \\
clockid CLOCK_TAI
tc qdisc replace dev $IFACE parent 100:1 etf skip_sock_check \\
offload delta 200000 clockid CLOCK_TAI
Note that all the traffic classes are mapped to the same queue. This is
only possible in taprio when txtime-assist is enabled. Also, note that the
ETF Qdisc is enabled with offload mode set.
In this mode, if the packet's traffic class is open and the complete packet
can be transmitted, taprio will try to transmit the packet immediately.
This will be done by setting skb->tstamp to current_time + the time delta
indicated in the txtime-delay parameter. This parameter indicates the time
taken (in software) for packet to reach the network adapter.
If the packet cannot be transmitted in the current interval or if the
packet's traffic is not currently transmitting, the skb->tstamp is set to
the next available timestamp value. This is tracked in the next_launchtime
parameter in the struct sched_entry.
The behaviour w.r.t admin and oper schedules is not changed from what is
present in software mode.
The transmit time is already known in advance. So, we do not need the HR
timers to advance the schedule and wakeup the dequeue side of taprio. So,
HR timer won't be run when this mode is enabled.
Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-26 06:07:17 +08:00
|
|
|
TCA_TAPRIO_ATTR_FLAGS, /* u32 */
|
2019-07-17 03:52:18 +08:00
|
|
|
TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
|
tc: Add support for configuring the taprio scheduler
This traffic scheduler allows traffic classes states (transmission
allowed/not allowed, in the simplest case) to be scheduled, according
to a pre-generated time sequence. This is the basis of the IEEE
802.1Qbv specification.
Example configuration:
tc qdisc replace dev enp3s0 parent root handle 100 taprio \
num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
queues 1@0 1@1 2@2 \
base-time 1528743495910289987 \
sched-entry S 01 300000 \
sched-entry S 02 300000 \
sched-entry S 04 300000 \
clockid CLOCK_TAI
The configuration format is similar to mqprio. The main difference is
the presence of a schedule, built by multiple "sched-entry"
definitions, each entry has the following format:
sched-entry <CMD> <GATE MASK> <INTERVAL>
The only supported <CMD> is "S", which means "SetGateStates",
following the IEEE 802.1Qbv-2015 definition (Table 8-6). <GATE MASK>
is a bitmask where each bit is associated with a traffic class, so
bit 0 (the least significant bit) being "on" means that traffic class
0 is "active" for that schedule entry. <INTERVAL> is a time duration
in nanoseconds that specifies for how long that state defined by <CMD>
and <GATE MASK> should be held before moving to the next entry.
This schedule is circular, that is, after the last entry is executed
it starts from the first one, indefinitely.
The other parameters can be defined as follows:
- base-time: specifies the instant when the schedule starts, if
'base-time' is a time in the past, the schedule will start at
base-time + (N * cycle-time)
where N is the smallest integer so the resulting time is greater
than "now", and "cycle-time" is the sum of all the intervals of the
entries in the schedule;
- clockid: specifies the reference clock to be used;
The parameters should be similar to what the IEEE 802.1Q family of
specification defines.
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-29 08:59:43 +08:00
|
|
|
__TCA_TAPRIO_ATTR_MAX,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Value of the last real TCA_TAPRIO_ATTR_* enumerator: the trailing
 * __TCA_TAPRIO_ATTR_MAX sentinel minus one.
 */
#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1)
|
|
|
|
|
2019-12-18 22:55:13 +08:00
|
|
|
/* ETS */
|
|
|
|
|
|
|
|
/* NOTE(review): presumably the upper bound on the number of ETS bands;
 * confirm against the code that consumes this constant.
 */
#define TCQ_ETS_MAX_BANDS 16
|
|
|
|
|
|
|
|
/* Attribute identifiers for ETS (TCA_ETS_*).
 * Enumerators are numbered implicitly, starting at 0 with TCA_ETS_UNSPEC.
 * The per-entry comments note each attribute's payload (u8, u32 or nested).
 * The trailing __TCA_ETS_MAX is a sentinel used to derive TCA_ETS_MAX.
 */
enum {
|
|
|
|
TCA_ETS_UNSPEC, /* 0 */
|
|
|
|
TCA_ETS_NBANDS, /* u8 */
|
|
|
|
TCA_ETS_NSTRICT, /* u8 */
|
|
|
|
TCA_ETS_QUANTA, /* nested TCA_ETS_QUANTA_BAND */
|
|
|
|
TCA_ETS_QUANTA_BAND, /* u32 */
|
|
|
|
TCA_ETS_PRIOMAP, /* nested TCA_ETS_PRIOMAP_BAND */
|
|
|
|
TCA_ETS_PRIOMAP_BAND, /* u8 */
|
|
|
|
__TCA_ETS_MAX, /* sentinel: one past the last attribute; keep last */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Value of the last real TCA_ETS_* enumerator: the trailing
 * __TCA_ETS_MAX sentinel minus one.
 */
#define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|