linux-sg2042/include/uapi/linux/pkt_sched.h

793 lines
18 KiB
C
Raw Normal View History

#ifndef __LINUX_PKT_SCHED_H
#define __LINUX_PKT_SCHED_H
#include <linux/types.h>
/* Logical priority bands not depending on specific packet scheduler.
Every scheduler will map them to real traffic classes, if it has
no more precise mechanism to classify packets.
These numbers have no special meaning, though their coincidence
with obsolete IPv6 values is not occasional :-). New IPv6 drafts
preferred full anarchy inspired by diffserv group.
Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy
class, actually, as rule it will be handled with more care than
filler or even bulk.
*/
#define TC_PRIO_BESTEFFORT 0
#define TC_PRIO_FILLER 1
#define TC_PRIO_BULK 2
#define TC_PRIO_INTERACTIVE_BULK 4
#define TC_PRIO_INTERACTIVE 6
#define TC_PRIO_CONTROL 7
#define TC_PRIO_MAX 15
/* Generic queue statistics, available for all the elements.
Particular schedulers may have also their private records.
*/
struct tc_stats {
__u64 bytes; /* Number of enqueued bytes */
__u32 packets; /* Number of enqueued packets */
__u32 drops; /* Packets dropped because of lack of resources */
__u32 overlimits; /* Number of throttle events when this
* flow goes out of allocated bandwidth */
__u32 bps; /* Current flow byte rate */
__u32 pps; /* Current flow packet rate */
__u32 qlen;
__u32 backlog;
};
struct tc_estimator {
signed char interval;
unsigned char ewma_log;
};
/* "Handles"
---------
All the traffic control objects have 32bit identifiers, or "handles".
They can be considered as opaque numbers from user API viewpoint,
but actually they always consist of two fields: major and
minor numbers, which are interpreted by kernel specially,
that may be used by applications, though not recommended.
F.e. qdisc handles always have minor number equal to zero,
classes (or flows) have major equal to parent qdisc major, and
minor uniquely identifying class inside qdisc.
Macros to manipulate handles:
*/
#define TC_H_MAJ_MASK (0xFFFF0000U)
#define TC_H_MIN_MASK (0x0000FFFFU)
#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK)
#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK)
#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK))
#define TC_H_UNSPEC (0U)
#define TC_H_ROOT (0xFFFFFFFFU)
#define TC_H_INGRESS (0xFFFFFFF1U)
/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */
enum tc_link_layer {
TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */
TC_LINKLAYER_ETHERNET,
TC_LINKLAYER_ATM,
};
#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */
struct tc_ratespec {
unsigned char cell_log;
__u8 linklayer; /* lower 4 bits */
unsigned short overhead;
short cell_align;
unsigned short mpu;
__u32 rate;
};
#define TC_RTAB_SIZE 1024
struct tc_sizespec {
unsigned char cell_log;
unsigned char size_log;
short cell_align;
int overhead;
unsigned int linklayer;
unsigned int mpu;
unsigned int mtu;
unsigned int tsize;
};
enum {
TCA_STAB_UNSPEC,
TCA_STAB_BASE,
TCA_STAB_DATA,
__TCA_STAB_MAX
};
#define TCA_STAB_MAX (__TCA_STAB_MAX - 1)
/* FIFO section */
struct tc_fifo_qopt {
__u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */
};
/* PRIO section */
#define TCQ_PRIO_BANDS 16
#define TCQ_MIN_PRIO_BANDS 2
struct tc_prio_qopt {
int bands; /* Number of bands */
__u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
};
/* MULTIQ section */
struct tc_multiq_qopt {
__u16 bands; /* Number of bands */
__u16 max_bands; /* Maximum number of queues */
};
net/sched: sch_plug - Queue traffic until an explicit release command The qdisc supports two operations - plug and unplug. When the qdisc receives a plug command via netlink request, packets arriving henceforth are buffered until a corresponding unplug command is received. Depending on the type of unplug command, the queue can be unplugged indefinitely or selectively. This qdisc can be used to implement output buffering, an essential functionality required for consistent recovery in checkpoint based fault-tolerance systems. Output buffering enables speculative execution by allowing generated network traffic to be rolled back. It is used to provide network protection for Xen Guests in the Remus high availability project, available as part of Xen. This module is generic enough to be used by any other system that wishes to add speculative execution and output buffering to its applications. This module was originally available in the linux 2.6.32 PV-OPS tree, used as dom0 for Xen. For more information, please refer to http://nss.cs.ubc.ca/remus/ and http://wiki.xensource.com/xenwiki/Remus Changes in V3: * Removed debug output (printk) on queue overflow * Added TCQ_PLUG_RELEASE_INDEFINITE - that allows the user to use this qdisc, for simple plug/unplug operations. * Use of packet counts instead of pointers to keep track of the buffers in the queue. Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> Signed-off-by: Brendan Cully <brendan@cs.ubc.ca> [author of the code in the linux 2.6.32 pvops tree] Signed-off-by: David S. Miller <davem@davemloft.net>
2012-02-05 21:51:32 +08:00
/* PLUG section */
#define TCQ_PLUG_BUFFER 0
#define TCQ_PLUG_RELEASE_ONE 1
#define TCQ_PLUG_RELEASE_INDEFINITE 2
#define TCQ_PLUG_LIMIT 3
struct tc_plug_qopt {
/* TCQ_PLUG_BUFFER: Inset a plug into the queue and
* buffer any incoming packets
* TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
* to beginning of the next plug.
* TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
* Stop buffering packets until the next TCQ_PLUG_BUFFER
* command is received (just act as a pass-thru queue).
* TCQ_PLUG_LIMIT: Increase/decrease queue size
*/
int action;
__u32 limit;
};
/* TBF section */
struct tc_tbf_qopt {
struct tc_ratespec rate;
struct tc_ratespec peakrate;
__u32 limit;
__u32 buffer;
__u32 mtu;
};
enum {
TCA_TBF_UNSPEC,
TCA_TBF_PARMS,
TCA_TBF_RTAB,
TCA_TBF_PTAB,
TCA_TBF_RATE64,
TCA_TBF_PRATE64,
__TCA_TBF_MAX,
};
#define TCA_TBF_MAX (__TCA_TBF_MAX - 1)
/* TEQL section */
/* TEQL does not require any parameters */
/* SFQ section */
struct tc_sfq_qopt {
unsigned quantum; /* Bytes per round allocated to flow */
int perturb_period; /* Period of hash perturbation */
__u32 limit; /* Maximal packets in queue */
unsigned divisor; /* Hash divisor */
unsigned flows; /* Maximal number of flows */
};
net_sched: sfq: add optional RED on top of SFQ Adds an optional Random Early Detection on each SFQ flow queue. Traditional SFQ limits count of packets, while RED permits to also control number of bytes per flow, and adds ECN capability as well. 1) We dont handle the idle time management in this RED implementation, since each 'new flow' begins with a null qavg. We really want to address backlogged flows. 2) if headdrop is selected, we try to ecn mark first packet instead of currently enqueued packet. This gives faster feedback for tcp flows compared to traditional RED [ marking the last packet in queue ] Example of use : tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \ limit 3000 headdrop flows 512 divisor 16384 \ redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 ewma 6 min 8000b max 60000b probability 0.2 ecn prob_mark 0 prob_mark_head 4876 prob_drop 6131 forced_mark 0 forced_mark_head 0 forced_drop 0 Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007 requeues 0) rate 99483Kbit 8219pps backlog 689392b 456p requeues 0 In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled flows, we can see number of packets CE marked is smaller than number of drops (for non ECN flows) If same test is run, without RED, we can check backlog is much bigger. qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0) rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0 Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> CC: Stephen Hemminger <shemminger@vyatta.com> CC: Dave Taht <dave.taht@gmail.com> Tested-by: Dave Taht <dave.taht@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-01-06 14:31:44 +08:00
struct tc_sfqred_stats {
__u32 prob_drop; /* Early drops, below max threshold */
__u32 forced_drop; /* Early drops, after max threshold */
__u32 prob_mark; /* Marked packets, below max threshold */
__u32 forced_mark; /* Marked packets, after max threshold */
__u32 prob_mark_head; /* Marked packets, below max threshold */
__u32 forced_mark_head;/* Marked packets, after max threshold */
};
struct tc_sfq_qopt_v1 {
struct tc_sfq_qopt v0;
unsigned int depth; /* max number of packets per flow */
unsigned int headdrop;
net_sched: sfq: add optional RED on top of SFQ Adds an optional Random Early Detection on each SFQ flow queue. Traditional SFQ limits count of packets, while RED permits to also control number of bytes per flow, and adds ECN capability as well. 1) We dont handle the idle time management in this RED implementation, since each 'new flow' begins with a null qavg. We really want to address backlogged flows. 2) if headdrop is selected, we try to ecn mark first packet instead of currently enqueued packet. This gives faster feedback for tcp flows compared to traditional RED [ marking the last packet in queue ] Example of use : tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \ limit 3000 headdrop flows 512 divisor 16384 \ redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 ewma 6 min 8000b max 60000b probability 0.2 ecn prob_mark 0 prob_mark_head 4876 prob_drop 6131 forced_mark 0 forced_mark_head 0 forced_drop 0 Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007 requeues 0) rate 99483Kbit 8219pps backlog 689392b 456p requeues 0 In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled flows, we can see number of packets CE marked is smaller than number of drops (for non ECN flows) If same test is run, without RED, we can check backlog is much bigger. qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0) rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0 Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> CC: Stephen Hemminger <shemminger@vyatta.com> CC: Dave Taht <dave.taht@gmail.com> Tested-by: Dave Taht <dave.taht@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-01-06 14:31:44 +08:00
/* SFQRED parameters */
__u32 limit; /* HARD maximal flow queue length (bytes) */
__u32 qth_min; /* Min average length threshold (bytes) */
__u32 qth_max; /* Max average length threshold (bytes) */
unsigned char Wlog; /* log(W) */
unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
unsigned char Scell_log; /* cell size for idle damping */
unsigned char flags;
__u32 max_P; /* probability, high resolution */
/* SFQRED stats */
struct tc_sfqred_stats stats;
};
struct tc_sfq_xstats {
__s32 allot;
};
/* RED section */
enum {
TCA_RED_UNSPEC,
TCA_RED_PARMS,
TCA_RED_STAB,
sch_red: Adaptative RED AQM Adaptative RED AQM for linux, based on paper from Sally FLoyd, Ramakrishna Gummadi, and Scott Shenker, August 2001 : http://icir.org/floyd/papers/adaptiveRed.pdf Goal of Adaptative RED is to make max_p a dynamic value between 1% and 50% to reach the target average queue : (max_th - min_th) / 2 Every 500 ms: if (avg > target and max_p <= 0.5) increase max_p : max_p += alpha; else if (avg < target and max_p >= 0.01) decrease max_p : max_p *= beta; target :[min_th + 0.4*(min_th - max_th), min_th + 0.6*(min_th - max_th)]. alpha : min(0.01, max_p / 4) beta : 0.9 max_P is a Q0.32 fixed point number (unsigned, with 32 bits mantissa) Changes against our RED implementation are : max_p is no longer a negative power of two (1/(2^Plog)), but a Q0.32 fixed point number, to allow full range described in Adatative paper. To deliver a random number, we now use a reciprocal divide (thats really a multiply), but this operation is done once per marked/droped packet when in RED_BETWEEN_TRESH window, so added cost (compared to previous AND operation) is near zero. dump operation gives current max_p value in a new TCA_RED_MAX_P attribute. Example on a 10Mbit link : tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 8sec red \ limit 400000 min 30000 max 90000 avpkt 1000 \ burst 55 ecn adaptative bandwidth 10Mbit # tc -s -d qdisc show dev eth3 ... qdisc red 10: parent 1:1 limit 400000b min 30000b max 90000b ecn adaptative ewma 5 max_p=0.113335 Scell_log 15 Sent 50414282 bytes 34504 pkt (dropped 35, overlimits 1392 requeues 0) rate 9749Kbit 831pps backlog 72056b 16p requeues 0 marked 1357 early 35 pdrop 0 other 0 Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-12-08 14:06:03 +08:00
TCA_RED_MAX_P,
__TCA_RED_MAX,
};
#define TCA_RED_MAX (__TCA_RED_MAX - 1)
struct tc_red_qopt {
__u32 limit; /* HARD maximal queue length (bytes) */
__u32 qth_min; /* Min average length threshold (bytes) */
__u32 qth_max; /* Max average length threshold (bytes) */
unsigned char Wlog; /* log(W) */
unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
unsigned char Scell_log; /* cell size for idle damping */
unsigned char flags;
sch_red: Adaptative RED AQM Adaptative RED AQM for linux, based on paper from Sally FLoyd, Ramakrishna Gummadi, and Scott Shenker, August 2001 : http://icir.org/floyd/papers/adaptiveRed.pdf Goal of Adaptative RED is to make max_p a dynamic value between 1% and 50% to reach the target average queue : (max_th - min_th) / 2 Every 500 ms: if (avg > target and max_p <= 0.5) increase max_p : max_p += alpha; else if (avg < target and max_p >= 0.01) decrease max_p : max_p *= beta; target :[min_th + 0.4*(min_th - max_th), min_th + 0.6*(min_th - max_th)]. alpha : min(0.01, max_p / 4) beta : 0.9 max_P is a Q0.32 fixed point number (unsigned, with 32 bits mantissa) Changes against our RED implementation are : max_p is no longer a negative power of two (1/(2^Plog)), but a Q0.32 fixed point number, to allow full range described in Adatative paper. To deliver a random number, we now use a reciprocal divide (thats really a multiply), but this operation is done once per marked/droped packet when in RED_BETWEEN_TRESH window, so added cost (compared to previous AND operation) is near zero. dump operation gives current max_p value in a new TCA_RED_MAX_P attribute. Example on a 10Mbit link : tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 8sec red \ limit 400000 min 30000 max 90000 avpkt 1000 \ burst 55 ecn adaptative bandwidth 10Mbit # tc -s -d qdisc show dev eth3 ... qdisc red 10: parent 1:1 limit 400000b min 30000b max 90000b ecn adaptative ewma 5 max_p=0.113335 Scell_log 15 Sent 50414282 bytes 34504 pkt (dropped 35, overlimits 1392 requeues 0) rate 9749Kbit 831pps backlog 72056b 16p requeues 0 marked 1357 early 35 pdrop 0 other 0 Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-12-08 14:06:03 +08:00
#define TC_RED_ECN 1
#define TC_RED_HARDDROP 2
#define TC_RED_ADAPTATIVE 4
};
struct tc_red_xstats {
__u32 early; /* Early drops */
__u32 pdrop; /* Drops due to queue limits */
__u32 other; /* Drops due to drop() calls */
__u32 marked; /* Marked packets */
};
/* GRED section */
#define MAX_DPs 16
enum {
TCA_GRED_UNSPEC,
TCA_GRED_PARMS,
TCA_GRED_STAB,
TCA_GRED_DPS,
TCA_GRED_MAX_P,
__TCA_GRED_MAX,
};
#define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
struct tc_gred_qopt {
__u32 limit; /* HARD maximal queue length (bytes) */
__u32 qth_min; /* Min average length threshold (bytes) */
__u32 qth_max; /* Max average length threshold (bytes) */
__u32 DP; /* up to 2^32 DPs */
__u32 backlog;
__u32 qave;
__u32 forced;
__u32 early;
__u32 other;
__u32 pdrop;
__u8 Wlog; /* log(W) */
__u8 Plog; /* log(P_max/(qth_max-qth_min)) */
__u8 Scell_log; /* cell size for idle damping */
__u8 prio; /* prio of this VQ */
__u32 packets;
__u32 bytesin;
};
/* gred setup */
struct tc_gred_sopt {
__u32 DPs;
__u32 def_DP;
__u8 grio;
__u8 flags;
__u16 pad1;
};
/* CHOKe section */
enum {
TCA_CHOKE_UNSPEC,
TCA_CHOKE_PARMS,
TCA_CHOKE_STAB,
TCA_CHOKE_MAX_P,
__TCA_CHOKE_MAX,
};
#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1)
struct tc_choke_qopt {
__u32 limit; /* Hard queue length (packets) */
__u32 qth_min; /* Min average threshold (packets) */
__u32 qth_max; /* Max average threshold (packets) */
unsigned char Wlog; /* log(W) */
unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
unsigned char Scell_log; /* cell size for idle damping */
unsigned char flags; /* see RED flags */
};
struct tc_choke_xstats {
__u32 early; /* Early drops */
__u32 pdrop; /* Drops due to queue limits */
__u32 other; /* Drops due to drop() calls */
__u32 marked; /* Marked packets */
__u32 matched; /* Drops due to flow match */
};
/* HTB section */
#define TC_HTB_NUMPRIO 8
#define TC_HTB_MAXDEPTH 8
#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */
struct tc_htb_opt {
struct tc_ratespec rate;
struct tc_ratespec ceil;
__u32 buffer;
__u32 cbuffer;
__u32 quantum;
__u32 level; /* out only */
__u32 prio;
};
struct tc_htb_glob {
__u32 version; /* to match HTB/TC */
__u32 rate2quantum; /* bps->quantum divisor */
__u32 defcls; /* default class number */
__u32 debug; /* debug flags */
/* stats */
__u32 direct_pkts; /* count of non shaped packets */
};
enum {
TCA_HTB_UNSPEC,
TCA_HTB_PARMS,
TCA_HTB_INIT,
TCA_HTB_CTAB,
TCA_HTB_RTAB,
TCA_HTB_DIRECT_QLEN,
TCA_HTB_RATE64,
TCA_HTB_CEIL64,
__TCA_HTB_MAX,
};
#define TCA_HTB_MAX (__TCA_HTB_MAX - 1)
struct tc_htb_xstats {
__u32 lends;
__u32 borrows;
__u32 giants; /* too big packets (rate will not be accurate) */
__u32 tokens;
__u32 ctokens;
};
/* HFSC section */
struct tc_hfsc_qopt {
__u16 defcls; /* default class */
};
struct tc_service_curve {
__u32 m1; /* slope of the first segment in bps */
__u32 d; /* x-projection of the first segment in us */
__u32 m2; /* slope of the second segment in bps */
};
struct tc_hfsc_stats {
__u64 work; /* total work done */
__u64 rtwork; /* work done by real-time criteria */
__u32 period; /* current period */
__u32 level; /* class level in hierarchy */
};
enum {
TCA_HFSC_UNSPEC,
TCA_HFSC_RSC,
TCA_HFSC_FSC,
TCA_HFSC_USC,
__TCA_HFSC_MAX,
};
#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1)
/* CBQ section */
#define TC_CBQ_MAXPRIO 8
#define TC_CBQ_MAXLEVEL 8
#define TC_CBQ_DEF_EWMA 5
struct tc_cbq_lssopt {
unsigned char change;
unsigned char flags;
#define TCF_CBQ_LSS_BOUNDED 1
#define TCF_CBQ_LSS_ISOLATED 2
unsigned char ewma_log;
unsigned char level;
#define TCF_CBQ_LSS_FLAGS 1
#define TCF_CBQ_LSS_EWMA 2
#define TCF_CBQ_LSS_MAXIDLE 4
#define TCF_CBQ_LSS_MINIDLE 8
#define TCF_CBQ_LSS_OFFTIME 0x10
#define TCF_CBQ_LSS_AVPKT 0x20
__u32 maxidle;
__u32 minidle;
__u32 offtime;
__u32 avpkt;
};
struct tc_cbq_wrropt {
unsigned char flags;
unsigned char priority;
unsigned char cpriority;
unsigned char __reserved;
__u32 allot;
__u32 weight;
};
struct tc_cbq_ovl {
unsigned char strategy;
#define TC_CBQ_OVL_CLASSIC 0
#define TC_CBQ_OVL_DELAY 1
#define TC_CBQ_OVL_LOWPRIO 2
#define TC_CBQ_OVL_DROP 3
#define TC_CBQ_OVL_RCLASSIC 4
unsigned char priority2;
__u16 pad;
__u32 penalty;
};
struct tc_cbq_police {
unsigned char police;
unsigned char __res1;
unsigned short __res2;
};
struct tc_cbq_fopt {
__u32 split;
__u32 defmap;
__u32 defchange;
};
struct tc_cbq_xstats {
__u32 borrows;
__u32 overactions;
__s32 avgidle;
__s32 undertime;
};
enum {
TCA_CBQ_UNSPEC,
TCA_CBQ_LSSOPT,
TCA_CBQ_WRROPT,
TCA_CBQ_FOPT,
TCA_CBQ_OVL_STRATEGY,
TCA_CBQ_RATE,
TCA_CBQ_RTAB,
TCA_CBQ_POLICE,
__TCA_CBQ_MAX,
};
#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1)
/* dsmark section */
enum {
TCA_DSMARK_UNSPEC,
TCA_DSMARK_INDICES,
TCA_DSMARK_DEFAULT_INDEX,
TCA_DSMARK_SET_TC_INDEX,
TCA_DSMARK_MASK,
TCA_DSMARK_VALUE,
__TCA_DSMARK_MAX,
};
#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1)
/* ATM section */
enum {
TCA_ATM_UNSPEC,
TCA_ATM_FD, /* file/socket descriptor */
TCA_ATM_PTR, /* pointer to descriptor - later */
TCA_ATM_HDR, /* LL header */
TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */
TCA_ATM_ADDR, /* PVC address (for output only) */
TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */
__TCA_ATM_MAX,
};
#define TCA_ATM_MAX (__TCA_ATM_MAX - 1)
/* Network emulator */
enum {
TCA_NETEM_UNSPEC,
TCA_NETEM_CORR,
TCA_NETEM_DELAY_DIST,
TCA_NETEM_REORDER,
TCA_NETEM_CORRUPT,
TCA_NETEM_LOSS,
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
TCA_NETEM_RATE,
TCA_NETEM_ECN,
__TCA_NETEM_MAX,
};
#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1)
struct tc_netem_qopt {
__u32 latency; /* added delay (us) */
__u32 limit; /* fifo limit (packets) */
__u32 loss; /* random packet loss (0=none ~0=100%) */
__u32 gap; /* re-ordering gap (0 for none) */
__u32 duplicate; /* random packet dup (0=none ~0=100%) */
__u32 jitter; /* random jitter in latency (us) */
};
struct tc_netem_corr {
__u32 delay_corr; /* delay correlation */
__u32 loss_corr; /* packet loss correlation */
__u32 dup_corr; /* duplicate correlation */
};
struct tc_netem_reorder {
__u32 probability;
__u32 correlation;
};
struct tc_netem_corrupt {
__u32 probability;
__u32 correlation;
};
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
struct tc_netem_rate {
__u32 rate; /* byte/s */
__s32 packet_overhead;
__u32 cell_size;
__s32 cell_overhead;
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
};
enum {
NETEM_LOSS_UNSPEC,
NETEM_LOSS_GI, /* General Intuitive - 4 state model */
NETEM_LOSS_GE, /* Gilbert Elliot models */
__NETEM_LOSS_MAX
};
#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1)
/* State transition probabilities for 4 state model */
struct tc_netem_gimodel {
__u32 p13;
__u32 p31;
__u32 p32;
__u32 p14;
__u32 p23;
};
/* Gilbert-Elliot models */
struct tc_netem_gemodel {
__u32 p;
__u32 r;
__u32 h;
__u32 k1;
};
#define NETEM_DIST_SCALE 8192
#define NETEM_DIST_MAX 16384
/* DRR */
enum {
TCA_DRR_UNSPEC,
TCA_DRR_QUANTUM,
__TCA_DRR_MAX
};
#define TCA_DRR_MAX (__TCA_DRR_MAX - 1)
struct tc_drr_stats {
__u32 deficit;
};
net_sched: implement a root container qdisc sch_mqprio This implements a mqprio queueing discipline that by default creates a pfifo_fast qdisc per tx queue and provides the needed configuration interface. Using the mqprio qdisc the number of tcs currently in use along with the range of queues alloted to each class can be configured. By default skbs are mapped to traffic classes using the skb priority. This mapping is configurable. Configurable parameters, struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_BITMASK + 1]; __u8 hw; __u16 count[TC_MAX_QUEUE]; __u16 offset[TC_MAX_QUEUE]; }; Here the count/offset pairing give the queue alignment and the prio_tc_map gives the mapping from skb->priority to tc. The hw bit determines if the hardware should configure the count and offset values. If the hardware bit is set then the operation will fail if the hardware does not implement the ndo_setup_tc operation. This is to avoid undetermined states where the hardware may or may not control the queue mapping. Also minimal bounds checking is done on the count/offset to verify a queue does not exceed num_tx_queues and that queue ranges do not overlap. Otherwise it is left to user policy or hardware configuration to create useful mappings. It is expected that hardware QOS schemes can be implemented by creating appropriate mappings of queues in ndo_tc_setup(). One expected use case is drivers will use the ndo_setup_tc to map queue ranges onto 802.1Q traffic classes. This provides a generic mechanism to map network traffic onto these traffic classes and removes the need for lower layer drivers to know specifics about traffic types. Signed-off-by: John Fastabend <john.r.fastabend@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-01-17 16:06:09 +08:00
/* MQPRIO */
#define TC_QOPT_BITMASK 15
#define TC_QOPT_MAX_QUEUE 16
struct tc_mqprio_qopt {
__u8 num_tc;
__u8 prio_tc_map[TC_QOPT_BITMASK + 1];
__u8 hw;
__u16 count[TC_QOPT_MAX_QUEUE];
__u16 offset[TC_QOPT_MAX_QUEUE];
};
net_sched: SFB flow scheduler This is the Stochastic Fair Blue scheduler, based on work from : W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: A New Class of Active Queue Management Algorithms. U. Michigan CSE-TR-387-99, April 1999. http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf This implementation is based on work done by Juliusz Chroboczek General SFB algorithm can be found in figure 14, page 15: B[l][n] : L x N array of bins (L levels, N bins per level) enqueue() Calculate hash function values h{0}, h{1}, .. h{L-1} Update bins at each level for i = 0 to L - 1 if (B[i][h{i}].qlen > bin_size) B[i][h{i}].p_mark += p_increment; else if (B[i][h{i}].qlen == 0) B[i][h{i}].p_mark -= p_decrement; p_min = min(B[0][h{0}].p_mark ... B[L-1][h{L-1}].p_mark); if (p_min == 1.0) ratelimit(); else mark/drop with probabilty p_min; I did the adaptation of Juliusz code to meet current kernel standards, and various changes to address previous comments : http://thread.gmane.org/gmane.linux.network/90225 http://thread.gmane.org/gmane.linux.network/90375 Default flow classifier is the rxhash introduced by RPS in 2.6.35, but we can use an external flow classifier if wanted. tc qdisc add dev $DEV parent 1:11 handle 11: \ est 0.5sec 2sec sfb limit 128 tc filter add dev $DEV protocol ip parent 11: handle 3 \ flow hash keys dst divisor 1024 Notes: 1) SFB default child qdisc is pfifo_fast. It can be changed by another qdisc but a child qdisc MUST not drop a packet previously queued. This is because SFB needs to handle a dequeued packet in order to maintain its virtual queue states. pfifo_head_drop or CHOKe should not be used. 2) ECN is enabled by default, unlike RED/CHOKe/GRED With help from Patrick McHardy & Andi Kleen Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> CC: Juliusz Chroboczek <Juliusz.Chroboczek@pps.jussieu.fr> CC: Stephen Hemminger <shemminger@vyatta.com> CC: Patrick McHardy <kaber@trash.net> CC: Andi Kleen <andi@firstfloor.org> CC: John W. Linville <linville@tuxdriver.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-02-23 18:56:17 +08:00
/* SFB */
enum {
TCA_SFB_UNSPEC,
TCA_SFB_PARMS,
__TCA_SFB_MAX,
};
#define TCA_SFB_MAX (__TCA_SFB_MAX - 1)
/*
* Note: increment, decrement are Q0.16 fixed-point values.
*/
struct tc_sfb_qopt {
__u32 rehash_interval; /* delay between hash move, in ms */
__u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */
__u32 max; /* max len of qlen_min */
__u32 bin_size; /* maximum queue length per bin */
__u32 increment; /* probability increment, (d1 in Blue) */
__u32 decrement; /* probability decrement, (d2 in Blue) */
__u32 limit; /* max SFB queue length */
__u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */
__u32 penalty_burst;
};
struct tc_sfb_xstats {
__u32 earlydrop;
__u32 penaltydrop;
__u32 bucketdrop;
__u32 queuedrop;
__u32 childdrop; /* drops in child qdisc */
__u32 marked;
__u32 maxqlen;
__u32 maxprob;
__u32 avgprob;
};
#define SFB_MAX_PROB 0xFFFF
/* QFQ */
enum {
TCA_QFQ_UNSPEC,
TCA_QFQ_WEIGHT,
TCA_QFQ_LMAX,
__TCA_QFQ_MAX
};
#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1)
struct tc_qfq_stats {
__u32 weight;
__u32 lmax;
};
codel: Controlled Delay AQM An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson. http://queue.acm.org/detail.cfm?id=2209336 This AQM main input is no longer queue size in bytes or packets, but the delay packets stay in (FIFO) queue. As we don't have infinite memory, we still can drop packets in enqueue() in case of massive load, but mean of CoDel is to drop packets in dequeue(), using a control law based on two simple parameters : target : target sojourn time (default 5ms) interval : width of moving time window (default 100ms) Based on initial work from Dave Taht. Refactored to help future codel inclusion as a plugin for other linux qdisc (FQ_CODEL, ...), like RED. include/net/codel.h contains codel algorithm as close as possible than Kathleen reference. net/sched/sch_codel.c contains the linux qdisc specific glue. Separate structures permit a memory efficient implementation of fq_codel (to be sent as a separate work) : Each flow has its own struct codel_vars. timestamps are taken at enqueue() time with 1024 ns precision, allowing a range of 2199 seconds in queue, and 100Gb links support. iproute2 uses usec as base unit. Selected packets are dropped, unless ECN is enabled and packets can get ECN mark instead. Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and tg3 drivers (BQL enabled). Usage: tc qdisc ... codel [ limit PACKETS ] [ target TIME ] [ interval TIME ] [ ecn ] qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0) rate 202365Kbit 16708pps backlog 113550b 75p requeues 0 count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us maxpacket 1514 ecn_mark 84399 drop_overlimit 0 CoDel must be seen as a base module, and should be used keeping in mind there is still a FIFO queue. So a typical setup will probably need a hierarchy of several qdiscs and packet classifiers to be able to meet whatever constraints a user might have. One possible example would be to use fq_codel, which combines Fair Queueing and CoDel, in replacement of sfq / sfq_red. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Dave Taht <dave.taht@bufferbloat.net> Cc: Kathleen Nichols <nichols@pollere.com> Cc: Van Jacobson <van@pollere.net> Cc: Tom Herbert <therbert@google.com> Cc: Matt Mathis <mattmathis@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Stephen Hemminger <shemminger@vyatta.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 15:51:25 +08:00
/* CODEL */
enum {
TCA_CODEL_UNSPEC,
TCA_CODEL_TARGET,
TCA_CODEL_LIMIT,
TCA_CODEL_INTERVAL,
TCA_CODEL_ECN,
__TCA_CODEL_MAX
};
#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1)
struct tc_codel_xstats {
__u32 maxpacket; /* largest packet we've seen so far */
__u32 count; /* how many drops we've done since the last time we
* entered dropping state
*/
__u32 lastcount; /* count at entry to dropping state */
__u32 ldelay; /* in-queue delay seen by most recently dequeued packet */
__s32 drop_next; /* time to drop next packet */
__u32 drop_overlimit; /* number of time max qdisc packet limit was hit */
__u32 ecn_mark; /* number of packets we ECN marked instead of dropped */
__u32 dropping; /* are we in dropping state ? */
};
fq_codel: Fair Queue Codel AQM Fair Queue Codel packet scheduler Principles : - Packets are classified (internal classifier or external) on flows. - This is a Stochastic model (as we use a hash, several flows might be hashed on same slot) - Each flow has a CoDel managed queue. - Flows are linked onto two (Round Robin) lists, so that new flows have priority on old ones. - For a given flow, packets are not reordered (CoDel uses a FIFO) - head drops only. - ECN capability is on by default. - Very low memory footprint (64 bytes per flow) tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ] [ target TIME ] [ interval TIME ] [ noecn ] [ quantum BYTES ] defaults : 1024 flows, 10240 packets limit, quantum : device MTU target : 5ms (CoDel default) interval : 100ms (CoDel default) Impressive results on load : class htb 1:1 root leaf 10: prio 0 quantum 1514 rate 200000Kbit ceil 200000Kbit burst 1475b/8 mpu 0b overhead 0b cburst 1475b/8 mpu 0b overhead 0b level 0 Sent 43304920109 bytes 33063109 pkt (dropped 0, overlimits 0 requeues 0) rate 201691Kbit 28595pps backlog 0b 312p requeues 0 lended: 33063109 borrowed: 0 giants: 0 tokens: -912 ctokens: -912 class fq_codel 10:1735 parent 10: (dropped 1292, overlimits 0 requeues 0) backlog 15140b 10p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 7.1ms class fq_codel 10:4524 parent 10: (dropped 1291, overlimits 0 requeues 0) backlog 16654b 11p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 7.1ms class fq_codel 10:4e74 parent 10: (dropped 1290, overlimits 0 requeues 0) backlog 6056b 4p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 6.4ms dropping drop_next 92.0ms class fq_codel 10:628a parent 10: (dropped 1289, overlimits 0 requeues 0) backlog 7570b 5p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 5.4ms dropping drop_next 90.9ms class fq_codel 10:a4b3 parent 10: (dropped 302, overlimits 0 requeues 0) backlog 16654b 11p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 7.1ms class fq_codel 10:c3c2 parent 10: (dropped 1284, overlimits 0 requeues 0) backlog 13626b 9p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 5.9ms class fq_codel 10:d331 parent 10: (dropped 299, overlimits 0 requeues 0) backlog 15140b 10p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 7.0ms class fq_codel 10:d526 parent 10: (dropped 12160, overlimits 0 requeues 0) backlog 35870b 211p requeues 0 deficit 1508 count 12160 lastcount 1 ldelay 15.3ms dropping drop_next 247us class fq_codel 10:e2c6 parent 10: (dropped 1288, overlimits 0 requeues 0) backlog 15140b 10p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 7.1ms class fq_codel 10:eab5 parent 10: (dropped 1285, overlimits 0 requeues 0) backlog 16654b 11p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 5.9ms class fq_codel 10:f220 parent 10: (dropped 1289, overlimits 0 requeues 0) backlog 15140b 10p requeues 0 deficit 1514 count 1 lastcount 1 ldelay 7.1ms qdisc htb 1: root refcnt 6 r2q 10 default 1 direct_packets_stat 0 ver 3.17 Sent 43331086547 bytes 33092812 pkt (dropped 0, overlimits 66063544 requeues 71) rate 201697Kbit 28602pps backlog 0b 260p requeues 71 qdisc fq_codel 10: parent 1:1 limit 10240p flows 65536 target 5.0ms interval 100.0ms ecn Sent 43331086547 bytes 33092812 pkt (dropped 949359, overlimits 0 requeues 0) rate 201697Kbit 28602pps backlog 189352b 260p requeues 0 maxpacket 1514 drop_overlimit 0 new_flow_count 5582 ecn_mark 125593 new_flows_len 0 old_flows_len 11 PING 172.30.42.18 (172.30.42.18) 56(84) bytes of data. 64 bytes from 172.30.42.18: icmp_req=1 ttl=64 time=0.227 ms 64 bytes from 172.30.42.18: icmp_req=2 ttl=64 time=0.165 ms 64 bytes from 172.30.42.18: icmp_req=3 ttl=64 time=0.166 ms 64 bytes from 172.30.42.18: icmp_req=4 ttl=64 time=0.151 ms 64 bytes from 172.30.42.18: icmp_req=5 ttl=64 time=0.164 ms 64 bytes from 172.30.42.18: icmp_req=6 ttl=64 time=0.172 ms 64 bytes from 172.30.42.18: icmp_req=7 ttl=64 time=0.175 ms 64 bytes from 172.30.42.18: icmp_req=8 ttl=64 time=0.183 ms 64 bytes from 172.30.42.18: icmp_req=9 ttl=64 time=0.158 ms 64 bytes from 172.30.42.18: icmp_req=10 ttl=64 time=0.200 ms 10 packets transmitted, 10 received, 0% packet loss, time 8999ms rtt min/avg/max/mdev = 0.151/0.176/0.227/0.022 ms Much better than SFQ because of priority given to new flows, and fast path dirtying less cache lines. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-11 17:30:50 +08:00
/* FQ_CODEL */
enum {
TCA_FQ_CODEL_UNSPEC,
TCA_FQ_CODEL_TARGET,
TCA_FQ_CODEL_LIMIT,
TCA_FQ_CODEL_INTERVAL,
TCA_FQ_CODEL_ECN,
TCA_FQ_CODEL_FLOWS,
TCA_FQ_CODEL_QUANTUM,
__TCA_FQ_CODEL_MAX
};
#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1)
enum {
TCA_FQ_CODEL_XSTATS_QDISC,
TCA_FQ_CODEL_XSTATS_CLASS,
};
struct tc_fq_codel_qd_stats {
__u32 maxpacket; /* largest packet we've seen so far */
__u32 drop_overlimit; /* number of time max qdisc
* packet limit was hit
*/
__u32 ecn_mark; /* number of packets we ECN marked
* instead of being dropped
*/
__u32 new_flow_count; /* number of time packets
* created a 'new flow'
*/
__u32 new_flows_len; /* count of flows in new list */
__u32 old_flows_len; /* count of flows in old list */
};
struct tc_fq_codel_cl_stats {
__s32 deficit;
__u32 ldelay; /* in-queue delay seen by most recently
* dequeued packet
*/
__u32 count;
__u32 lastcount;
__u32 dropping;
__s32 drop_next;
};
struct tc_fq_codel_xstats {
__u32 type;
union {
struct tc_fq_codel_qd_stats qdisc_stats;
struct tc_fq_codel_cl_stats class_stats;
};
};
pkt_sched: fq: Fair Queue packet scheduler - Uses perfect flow match (not stochastic hash like SFQ/FQ_codel) - Uses the new_flow/old_flow separation from FQ_codel - New flows get an initial credit allowing IW10 without added delay. - Special FIFO queue for high prio packets (no need for PRIO + FQ) - Uses a hash table of RB trees to locate the flows at enqueue() time - Smart on demand gc (at enqueue() time, RB tree lookup evicts old unused flows) - Dynamic memory allocations. - Designed to allow millions of concurrent flows per Qdisc. - Small memory footprint : ~8K per Qdisc, and 104 bytes per flow. - Single high resolution timer for throttled flows (if any). - One RB tree to link throttled flows. - Ability to have a max rate per flow. We might add a socket option to add per socket limitation. Attempts have been made to add TCP pacing in TCP stack, but this seems to add complex code to an already complex stack. TCP pacing is welcomed for flows having idle times, as the cwnd permits TCP stack to queue a possibly large number of packets. This removes the 'slow start after idle' choice, hitting badly large BDP flows, and applications delivering chunks of data as video streams. Nicely spaced packets : Here interface is 10Gbit, but flow bottleneck is ~20Mbit cwin is big, yet FQ avoids the typical bursts generated by TCP (as in netperf TCP_RR -- -r 100000,100000) 15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115> 15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115> 15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115> 15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115> 15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115> 15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115> 15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115> 15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805> 15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115> 15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115> 15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115> 15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115> 15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115> 15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115> 15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> 15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115> TCP timestamps show that most packets from B were queued in the same ms timeframe (TSval 1159799{3,4}), but FQ managed to send them right in time to avoid a big burst. In slow start or steady state, very few packets are throttled [1] FQ gets a bunch of tunables as : limit : max number of packets on whole Qdisc (default 10000) flow_limit : max number of packets per flow (default 100) quantum : the credit per RR round (default is 2 MTU) initial_quantum : initial credit for new flows (default is 10 MTU) maxrate : max per flow rate (default : unlimited) buckets : number of RB trees (default : 1024) in hash table. (consumes 8 bytes per bucket) [no]pacing : disable/enable pacing (default is enable) All of them can be changed on a live qdisc. $ tc qd add dev eth0 root fq help Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ] [ quantum BYTES ] [ initial_quantum BYTES ] [ maxrate RATE ] [ buckets NUMBER ] [ [no]pacing ] $ tc -s -d qd qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14) backlog 0b 0p requeues 14 511 flows, 511 inactive, 0 throttled 110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit [1] Except if initial srtt is overestimated, as if using cached srtt in tcp metrics. We'll provide a fix for this issue. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-30 06:49:55 +08:00
/* FQ */
enum {
TCA_FQ_UNSPEC,
TCA_FQ_PLIMIT, /* limit of total number of packets in queue */
TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */
TCA_FQ_QUANTUM, /* RR quantum */
TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */
TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */
TCA_FQ_FLOW_DEFAULT_RATE,/* for sockets with unspecified sk_rate,
* use the following rate
*/
TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */
TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */
__TCA_FQ_MAX
};
#define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
struct tc_fq_qd_stats {
__u64 gc_flows;
__u64 highprio_packets;
__u64 tcp_retrans;
__u64 throttled;
__u64 flows_plimit;
__u64 pkts_too_long;
__u64 allocation_errors;
__s64 time_next_delayed_flow;
__u32 flows;
__u32 inactive_flows;
__u32 throttled_flows;
__u32 pad;
};
#endif