diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 371b3b45fd5c..fe0d9b44d6fc 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -139,8 +139,8 @@ struct inet_connection_sock { } icsk_mtup; u32 icsk_user_timeout; - u64 icsk_ca_priv[88 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64)) + u64 icsk_ca_priv[104 / sizeof(u64)]; +#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 0f497fc49c3f..56be7d27f208 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -115,6 +115,14 @@ struct bbr { unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + unused_c:6; }; #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ @@ -182,6 +190,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8; /* If we estimate we're policed, use lt_bw for this many round trips: */ static const u32 bbr_lt_bw_max_rtts = 48; +/* Gain factor for adding extra_acked to target cwnd: */ +static const int bbr_extra_acked_gain = BBR_UNIT; +/* Window length of extra_acked window. */ +static const u32 bbr_extra_acked_win_rtts = 5; +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; +/* Time period for clamping cwnd increment due to ack aggregation */ +static const u32 bbr_extra_acked_max_us = 100 * 1000; + static void bbr_check_probe_rtt_done(struct sock *sk); /* Do we estimate that STARTUP filled the pipe? */ @@ -208,6 +225,16 @@ static u32 bbr_bw(const struct sock *sk) return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); } +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + /* Return rate in bytes per second, optionally with a gain. * The order here is chosen carefully to avoid overflow of u64. This should * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. @@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) if (event == CA_EVENT_TX_START && tp->app_limited) { bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; /* Avoid pointless buffer overflows: pace at est. bw if we don't * need more speed (we're restarting from idle and app-limited). */ @@ -315,30 +344,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) } } -/* Find target cwnd. Right-size the cwnd based on min RTT and the - * estimated bottleneck bandwidth: +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * cwnd = bw * min_rtt * gain = BDP * gain + * bdp = bw * min_rtt * gain * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT * measurements (e.g., delayed ACKs or other ACK compression effects). This * noise may cause BBR to under-estimate the rate. - * - * To achieve full performance in high-speed paths, we budget enough cwnd to - * fit full-sized skbs in-flight on both end hosts to fully utilize the path: - * - one skb in sending host Qdisc, - * - one skb in sending host TSO/GSO engine - * - one skb being received by receiver host LRO/GRO/delayed-ACK engine - * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because - * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, - * which allows 2 outstanding 2-packet sequences, to try to keep pipe - * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) { struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd; + u32 bdp; u64 w; /* If we've never had a valid RTT sample, cap cwnd at the initial @@ -353,7 +371,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us; /* Apply a gain to the given value, then remove the BW_SCALE shift. */ - cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); /* Allow enough full-sized skbs in flight to utilize end systems. */ cwnd += 3 * bbr_tso_segs_goal(sk); @@ -368,6 +403,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; } +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight, gain); + + return inflight; +} + /* With pacing at lower layers, there's often less data "in the network" than * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), * we often have several skbs queued in the pacing layer with a pre-scheduled @@ -401,6 +447,22 @@ static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) return inflight_at_edt - interval_delivered; } +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -461,8 +523,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) goto done; + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ - target_cwnd = bbr_target_cwnd(sk, bw, gain); if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ cwnd = min(cwnd + acked, target_cwnd); else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) @@ -503,14 +572,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain > BBR_UNIT) return is_full_length && (rs->losses || /* perhaps pacing_gain*BDP won't fit */ - inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); /* A pacing_gain < 1.0 tries to drain extra queue we added if bw * probing didn't find more bw. If inflight falls to match BDP then we * estimate queue is drained; persisting would underutilize the pipe. */ return is_full_length || - inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); + inflight <= bbr_inflight(sk, bw, BBR_UNIT); } static void bbr_advance_cycle_phase(struct sock *sk) @@ -727,6 +796,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) } } +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + /* Estimate when the pipe is full, using the change in delivery rate: BBR * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited @@ -762,11 +892,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { bbr->mode = BBR_DRAIN; /* drain queue we created */ tcp_sk(sk)->snd_ssthresh = - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ } @@ -881,6 +1011,7 @@ static void bbr_update_gains(struct sock *sk) static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); + bbr_update_ack_aggregation(sk, rs); bbr_update_cycle_phase(sk, rs); bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); @@ -932,6 +1063,13 @@ static void bbr_init(struct sock *sk) bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk); + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); }