From 473900a504e510cf9175876de8892ad1e3e7efab Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 23 Dec 2019 12:27:50 -0800
Subject: [PATCH 1/5] tcp_cubic: optimize hystart_update()

We do not care which bit in ca->found is set.

We avoid accessing hystart and hystart_detect unless really
needed, possibly avoiding one cache line miss.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_cubic.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 1b3d032a4df2..297936033c33 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -381,9 +381,6 @@ static void hystart_update(struct sock *sk, u32 delay)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 
-	if (ca->found & hystart_detect)
-		return;
-
 	if (hystart_detect & HYSTART_ACK_TRAIN) {
 		u32 now = bictcp_clock();
 
@@ -391,7 +388,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
 			ca->last_ack = now;
 			if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
-				ca->found |= HYSTART_ACK_TRAIN;
+				ca->found = 1;
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
 				NET_ADD_STATS(sock_net(sk),
@@ -412,7 +409,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 		} else {
 			if (ca->curr_rtt > ca->delay_min +
 			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
-				ca->found |= HYSTART_DELAY;
+				ca->found = 1;
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
 				NET_ADD_STATS(sock_net(sk),
@@ -450,7 +447,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 		ca->delay_min = delay;
 
 	/* hystart triggers when cwnd is larger than some threshold */
-	if (hystart && tcp_in_slow_start(tp) &&
+	if (!ca->found && hystart && tcp_in_slow_start(tp) &&
 	    tp->snd_cwnd >= hystart_low_window)
 		hystart_update(sk, delay);
 }

From 35821fc2b41c51161e503bade3630603668dd3af Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 23 Dec 2019 12:27:51 -0800
Subject: [PATCH 2/5] tcp_cubic: remove one conditional from hystart_update()

If we initialize ca->curr_rtt to ~0U, we do not need to test
for zero value in hystart_update().

We only read ca->curr_rtt if at least HYSTART_MIN_SAMPLES have
been processed, and thus ca->curr_rtt will have a sane value.

Signed-off-by: Eric Dumazet
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_cubic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 297936033c33..5eff762acd7f 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -133,7 +133,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
 
 	ca->round_start = ca->last_ack = bictcp_clock();
 	ca->end_seq = tp->snd_nxt;
-	ca->curr_rtt = 0;
+	ca->curr_rtt = ~0U;
 	ca->sample_cnt = 0;
 }
 
@@ -402,7 +402,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 	if (hystart_detect & HYSTART_DELAY) {
 		/* obtain the minimum delay of more than sampling packets */
 		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
-			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+			if (ca->curr_rtt > delay)
 				ca->curr_rtt = delay;
 
 			ca->sample_cnt++;
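Aside, not part of the series: the ~0U initialization in patch 2 is the classic running-minimum idiom. A minimal user-space C sketch (illustrative values and a made-up sample_rtt() helper, not kernel code) shows why seeding the minimum with the largest representable value lets the update drop its "first sample" special case:

/* Why seeding a running minimum with ~0U removes a branch.
 * User-space illustration only; not the kernel implementation.
 */
#include <stdio.h>

#define HYSTART_MIN_SAMPLES 8

static unsigned int curr_rtt = ~0U;	/* was 0, which needed an extra "== 0" test */
static unsigned int sample_cnt;

static void sample_rtt(unsigned int delay_us)
{
	if (sample_cnt < HYSTART_MIN_SAMPLES) {
		if (curr_rtt > delay_us)	/* one compare: ~0U loses to any sample */
			curr_rtt = delay_us;
		sample_cnt++;
	}
}

int main(void)
{
	unsigned int samples[] = { 500, 420, 430, 410, 450, 405, 460, 440 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		sample_rtt(samples[i]);
	printf("min rtt over %u samples: %u usec\n", sample_cnt, curr_rtt);
	return 0;
}

Since every real RTT sample compares below ~0U, the first sample always wins the comparison, and the "curr_rtt == 0" branch becomes dead weight.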
From cff04e2da308c522f654237b45dd64248fe8d1fa Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 23 Dec 2019 12:27:52 -0800
Subject: [PATCH 3/5] tcp_cubic: switch bictcp_clock() to usec resolution

Current 1ms clock feeds ca->round_start, ca->delay_min,
ca->last_ack.

This is quite problematic for data-center flows, where delay_min
is way below 1 ms.

This means Hystart Train detection triggers every time the jiffies
value is updated, since the
"((s32)(now - ca->round_start) > ca->delay_min >> 4)" expression
becomes true.

This kind of random behavior can be solved by reusing the existing
usec timestamp that TCP keeps in tp->tcp_mstamp.

Note that a followup patch will tweak things a bit, because
during slow start, GRO aggregation on receivers naturally
increases the RTT as TSO packets gradually come to ~64KB size.

To recap, right after this patch CUBIC Hystart train detection
is more aggressive, since short RTT flows might exit slow start
at cwnd = 20, instead of being possibly unbounded. The following
patch will address this problem.

Signed-off-by: Eric Dumazet
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_cubic.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 5eff762acd7f..068775b91fb5 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -40,8 +40,8 @@
 
 /* Number of delay samples for detecting the increase of delay */
 #define HYSTART_MIN_SAMPLES	8
-#define HYSTART_DELAY_MIN	(4U<<3)
-#define HYSTART_DELAY_MAX	(16U<<3)
+#define HYSTART_DELAY_MIN	(4000U)	/* 4 ms */
+#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
 #define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
 
 static int fast_convergence __read_mostly = 1;
@@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1;
 static int hystart __read_mostly = 1;
 static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
 static int hystart_low_window __read_mostly = 16;
-static int hystart_ack_delta __read_mostly = 2;
+static int hystart_ack_delta_us __read_mostly = 2000;
 
 static u32 cube_rtt_scale __read_mostly;
 static u32 beta_scale __read_mostly;
@@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
 		 " 1: packet-train 2: delay 3: both packet-train and delay");
 module_param(hystart_low_window, int, 0644);
 MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
-module_param(hystart_ack_delta, int, 0644);
-MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -89,7 +89,7 @@ struct bictcp {
 	u32	bic_origin_point;/* origin point of bic function */
 	u32	bic_K;		/* time to origin point
 				   from the beginning of the current epoch */
-	u32	delay_min;	/* min delay (msec << 3) */
+	u32	delay_min;	/* min delay (usec) */
 	u32	epoch_start;	/* beginning of an epoch */
 	u32	ack_cnt;	/* number of acks */
 	u32	tcp_cwnd;	/* estimated tcp cwnd */
@@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca)
 	ca->found = 0;
 }
 
-static inline u32 bictcp_clock(void)
+static inline u32 bictcp_clock_us(const struct sock *sk)
 {
-#if HZ < 1000
-	return ktime_to_ms(ktime_get_real());
-#else
-	return jiffies_to_msecs(jiffies);
-#endif
+	return tcp_sk(sk)->tcp_mstamp;
 }
 
 static inline void bictcp_hystart_reset(struct sock *sk)
@@ -131,7 +127,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 
-	ca->round_start = ca->last_ack = bictcp_clock();
+	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
 	ca->end_seq = tp->snd_nxt;
 	ca->curr_rtt = ~0U;
 	ca->sample_cnt = 0;
@@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
 	 */
 
 	t = (s32)(tcp_jiffies32 - ca->epoch_start);
-	t += msecs_to_jiffies(ca->delay_min >> 3);
+	t += usecs_to_jiffies(ca->delay_min);
 	/* change the unit from HZ to bictcp_HZ */
 	t <<= BICTCP_HZ;
 	do_div(t, HZ);
@@ -382,12 +378,12 @@ static void hystart_update(struct sock *sk, u32 delay)
 	struct bictcp *ca = inet_csk_ca(sk);
 
 	if (hystart_detect & HYSTART_ACK_TRAIN) {
-		u32 now = bictcp_clock();
+		u32 now = bictcp_clock_us(sk);
 
 		/* first detection parameter - ack-train detection */
-		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
 			ca->last_ack = now;
-			if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
+			if ((s32)(now - ca->round_start) > ca->delay_min >> 1) {
 				ca->found = 1;
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
@@ -421,9 +417,6 @@ static void hystart_update(struct sock *sk, u32 delay)
 	}
 }
 
-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
 static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -438,7 +431,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
 		return;
 
-	delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
+	delay = sample->rtt_us;
 	if (delay == 0)
 		delay = 1;
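Aside, not part of the series: both the ack-train test and the round-start test above rely on wrap-safe u32 timestamp arithmetic: subtract modulo 2^32, then cast to s32. A standalone sketch with hypothetical timestamps (acks_form_train() is a made-up name, not a kernel function):

/* Wrap-safe usec timestamp comparison, as used by the ack-train test.
 * User-space illustration with made-up values.
 */
#include <stdio.h>
#include <stdint.h>

static int acks_form_train(uint32_t now, uint32_t last_ack, uint32_t ack_delta_us)
{
	/* (s32)(now - last_ack) stays correct even if the u32 clock
	 * wrapped between the two samples.
	 */
	return (int32_t)(now - last_ack) <= (int32_t)ack_delta_us;
}

int main(void)
{
	uint32_t last_ack = 0xfffffff0u;	/* just before the u32 clock wraps */
	uint32_t now = 0x00000010u;		/* 32 usec later, after the wrap */

	printf("delta=%d usec, train=%d\n",
	       (int)(int32_t)(now - last_ack),
	       acks_form_train(now, last_ack, 2000));
	return 0;
}

Numerically, now is smaller than last_ack here, so any direct comparison of the raw timestamps would misorder the two events; the modular subtraction still yields the true 32 usec gap.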
From 42f3a8aaae66d31d87850fb4b02979a0fc5dc541 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 23 Dec 2019 12:27:53 -0800
Subject: [PATCH 4/5] tcp_cubic: tweak Hystart detection for short RTT flows

After switching ca->delay_min to usec resolution, we exit slow start
prematurely for very low RTT flows, setting snd_ssthresh to 20.

The reason is that delay_min is fed with RTT of small packet trains.
Then as cwnd is increased, TCP sends bigger TSO packets.

LRO/GRO aggregation and/or interrupt mitigation strategies on the
receiver tend to inflate RTT samples.

Fix this by adding to delay_min the expected delay of two TSO packets,
given current pacing rate.

Tested:

Sender uses pfifo_fast qdisc

Before:
$ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart"
  11348
  11707
  11562
  11428
  11773
  11534
   9878
  11693
  10597
  10968
TcpExtTCPHystartTrainDetect     10                 0.0
TcpExtTCPHystartTrainCwnd       200                0.0

After:
$ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart"
  14877
  14517
  15797
  18466
  17376
  14833
  17558
  17933
  16039
  18059
TcpExtTCPHystartTrainDetect     10                 0.0
TcpExtTCPHystartTrainCwnd       1670               0.0

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_cubic.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 068775b91fb5..0e5428ed04fe 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -436,8 +436,27 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 		delay = 1;
 
 	/* first time call or link delay decreases */
-	if (ca->delay_min == 0 || ca->delay_min > delay)
-		ca->delay_min = delay;
+	if (ca->delay_min == 0 || ca->delay_min > delay) {
+		unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
+
+		/* Account for TSO/GRO delays.
+		 * Otherwise short RTT flows could get too small ssthresh,
+		 * since during slow start we begin with small TSO packets
+		 * and could lower ca->delay_min too much.
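Aside, not part of the series: to get a feel for the cushion patch 4 adds, the expression min(1 ms, GSO_MAX_SIZE * 4 * USEC_PER_SEC / rate) can be evaluated for a few pacing rates. A user-space sketch, assuming GSO_MAX_SIZE = 65536 and using a made-up tso_gro_cushion_us() helper (rates are illustrative):

/* Evaluate the TSO/GRO cushion from patch 4 for a few pacing rates
 * (bytes/sec). User-space illustration only.
 */
#include <stdio.h>
#include <stdint.h>

#define GSO_MAX_SIZE	65536u
#define USEC_PER_SEC	1000000ull
#define USEC_PER_MSEC	1000ull

static uint64_t tso_gro_cushion_us(unsigned long rate)
{
	uint64_t c;

	if (!rate)
		return 0;
	/* two full-size TSO packets, doubled again because @rate
	 * is already twice the actual pacing rate at this point
	 */
	c = (uint64_t)GSO_MAX_SIZE * 4 * USEC_PER_SEC / rate;
	return c < USEC_PER_MSEC ? c : USEC_PER_MSEC;	/* cap at 1 ms */
}

int main(void)
{
	unsigned long rates[] = {
		1250000000ul,	/* 10 Gbit/s */
		125000000ul,	/* 1 Gbit/s */
		12500000ul,	/* 100 Mbit/s */
	};
	int i;

	for (i = 0; i < 3; i++)
		printf("rate=%lu B/s -> cushion=%llu usec\n", rates[i],
		       (unsigned long long)tso_gro_cushion_us(rates[i]));
	return 0;
}

At 10 Gbit/s the cushion works out to about 210 usec; at 1 Gbit/s and below the 1 ms cap takes over, so delay_min can never be inflated by more than a millisecond.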
+		 * Ideally even with a very small RTT we would like to have
+		 * at least one TSO packet being sent and received by GRO,
+		 * and another one in qdisc layer.
+		 * We apply another 100% factor because @rate is doubled at
+		 * this point.
+		 * We cap the cushion to 1ms.
+		 */
+		if (rate)
+			delay += min_t(u64, USEC_PER_MSEC,
+				       div64_ul((u64)GSO_MAX_SIZE *
+						4 * USEC_PER_SEC, rate));
+		if (ca->delay_min == 0 || ca->delay_min > delay)
+			ca->delay_min = delay;
+	}
 
 	/* hystart triggers when cwnd is larger than some threshold */
 	if (!ca->found && hystart && tcp_in_slow_start(tp) &&

From ede656e8465839530c3287c7f54adf75dc2b9563 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 23 Dec 2019 12:27:54 -0800
Subject: [PATCH 5/5] tcp_cubic: make Hystart aware of pacing

For years we disabled Hystart ACK train detection at Google
because it was fooled by TCP pacing.

ACK train detection uses a simple heuristic, detecting if
we receive ACK past half the RTT, to exit slow start before
hitting the bottleneck and experiencing massive drops.

But pacing by design might delay packets up to RTT/2, so we need
to tweak the Hystart logic to be aware of this extra delay.

Tested:

Added a 100 usec delay at receiver.

Before:
nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart"
   9117
   7057
   9553
   8300
   7030
   6849
   9533
  10126
   6876
   8473
TcpExtTCPHystartTrainDetect     10                 0.0
TcpExtTCPHystartTrainCwnd       1230               0.0

After:
nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart"
   9845
  10103
  10866
  11096
  11936
  11487
  11773
  12188
  11066
  11894
TcpExtTCPHystartTrainDetect     10                 0.0
TcpExtTCPHystartTrainCwnd       6462               0.0

Disabling Hystart ACK Train detection gives similar numbers:

echo 2 >/sys/module/tcp_cubic/parameters/hystart_detect
nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart"
  11173
  10954
  12455
  10627
  11578
  11583
  11222
  10880
  10665
  11366

Signed-off-by: Eric Dumazet
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_cubic.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 0e5428ed04fe..d02bb283c689 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -376,6 +376,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
+	u32 threshold;
 
 	if (hystart_detect & HYSTART_ACK_TRAIN) {
 		u32 now = bictcp_clock_us(sk);
@@ -383,7 +384,17 @@ static void hystart_update(struct sock *sk, u32 delay)
 		/* first detection parameter - ack-train detection */
 		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
 			ca->last_ack = now;
-			if ((s32)(now - ca->round_start) > ca->delay_min >> 1) {
+
+			threshold = ca->delay_min;
+			/* Hystart ack train triggers if we get ack past
+			 * ca->delay_min/2.
+			 * Pacing might have delayed packets up to RTT/2
+			 * during slow start.
+			 */
+			if (sk->sk_pacing_status == SK_PACING_NONE)
+				threshold >>= 1;
+
+			if ((s32)(now - ca->round_start) > threshold) {
 				ca->found = 1;
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
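Aside, not part of the series: restating the pacing-aware threshold from patch 5 standalone. Without pacing the train triggers past delay_min/2; with pacing, which may legitimately hold packets back up to RTT/2, the full delay_min is used. A user-space sketch with a hypothetical delay_min (the enum mirrors the kernel's sk_pacing_status values but is redefined here for self-containment):

/* Pacing-aware ack-train threshold from patch 5.
 * User-space illustration; values are made up.
 */
#include <stdio.h>
#include <stdint.h>

enum pacing_status { SK_PACING_NONE, SK_PACING_NEEDED, SK_PACING_FQ };

static uint32_t train_threshold_us(uint32_t delay_min_us, enum pacing_status st)
{
	uint32_t threshold = delay_min_us;

	/* trigger at delay_min/2 normally; keep the full delay_min
	 * when pacing may have delayed packets up to RTT/2
	 */
	if (st == SK_PACING_NONE)
		threshold >>= 1;
	return threshold;
}

int main(void)
{
	uint32_t delay_min = 600;	/* usec, hypothetical data-center RTT */

	printf("no pacing: %u usec, fq pacing: %u usec\n",
	       train_threshold_us(delay_min, SK_PACING_NONE),
	       train_threshold_us(delay_min, SK_PACING_FQ));
	return 0;
}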